From d1429ac4a55a2f6cbaeaf1cca572601e5d344667 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 15 Nov 2018 19:46:22 +0800 Subject: [PATCH 001/113] add recordio support --- CMakeLists.txt | 6 +- cmake/external/eigen.cmake | 10 +-- cmake/external/gflags.cmake | 5 +- cmake/external/glog.cmake | 3 +- cmake/external/gtest.cmake | 5 +- cmake/external/protobuf.cmake | 5 +- cmake/external/snappy.cmake | 12 +++- cmake/external/snappystream.cmake | 61 +++++++++++-------- cmake/external/zlib.cmake | 5 +- paddle/fluid/CMakeLists.txt | 6 +- paddle/fluid/framework/CMakeLists.txt | 6 +- paddle/fluid/operators/CMakeLists.txt | 8 +-- .../operators/reader/create_py_reader_op.cc | 2 +- paddle/fluid/operators/roi_align_op.cc | 6 +- paddle/fluid/operators/roi_pool_op.cc | 6 +- paddle/fluid/operators/space_to_depth_op.cc | 2 +- paddle/fluid/platform/port.h | 10 +-- paddle/fluid/pybind/CMakeLists.txt | 5 +- paddle/fluid/pybind/pybind.cc | 9 +-- python/paddle/fluid/layers/nn.py | 6 ++ 20 files changed, 97 insertions(+), 81 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 996a79fbbc..d6e7b88f86 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,11 +190,11 @@ include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) include(external/xxhash) # download xxhash - -if (NOT WIN32) -# there is no official support of snappystream, warpctc, nccl, cupti in windows include(external/snappy) # download snappy include(external/snappystream) # download snappystream + +if (NOT WIN32) +# there is no official support of warpctc, nccl, cupti in windows include(external/warpctc) # download, build, install warpctc include(cupti) endif (NOT WIN32) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 573ad5e5f0..98079678ae 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -16,8 +16,9 @@ if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 +# GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" +# GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" @@ -29,10 +30,11 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" +# GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c +# GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c PREFIX ${EIGEN_SOURCE_DIR} DOWNLOAD_NAME "eigen" UPDATE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 4e98e4bf88..7c062d682c 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,8 +28,9 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/gflags/gflags.git" - GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a +# GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gflags.git" +# GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 8cd0455c16..a3f3c6adf3 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -34,13 +34,14 @@ ELSE() SET(GLOG_REPOSITORY "https://github.com/google/glog.git") SET(GLOG_TAG "v0.3.5") ENDIF() + SET(GLOG_REPOSITORY "http://admin@172.20.90.14:8080/r/glog.git") ExternalProject_Add( extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY ${GLOG_REPOSITORY} - GIT_TAG ${GLOG_TAG} + # GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index d335298742..da539d52bd 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -43,8 +43,9 @@ IF(WITH_TESTING) extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${GTEST_DEPENDS} - GIT_REPOSITORY "https://github.com/google/googletest.git" - GIT_TAG "release-1.8.0" + # GIT_REPOSITORY "https://github.com/google/googletest.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gtest.git" +# GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index e1e619e572..94d8ac30cc 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -202,8 +202,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") ENDIF() - SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") - SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + # SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + SET(PROTOBUF_REPO http://admin@172.20.90.14:8080/r/protobuf.git) IF(MOBILE_INFERENCE) # The reason why the official version is not used is described in # https://github.com/PaddlePaddle/Paddle/issues/6114 diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index af09ed4d5d..b30403d2d8 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -24,7 +24,11 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) -set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +if (WIN32) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib") +else(WIN32) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +endif (WIN32) ExternalProject_Add( extern_snappy @@ -34,8 +38,12 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE=ON diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake index 6df636d7fa..1ec79462c1 100644 --- a/cmake/external/snappystream.cmake +++ b/cmake/external/snappystream.cmake @@ -18,36 +18,45 @@ ENDIF() include (ExternalProject) -# NOTE: snappy is needed when linking with recordio - set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream) set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE) -set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") - -ExternalProject_Add( - extern_snappystream - GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git" - GIT_TAG "0.2.8" - PREFIX ${SNAPPYSTREAM_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - DEPENDS snappy -) +if(WIN32) + # Fix me, VS2015 come without VLA support + set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib") + MESSAGE(WARNING, "In windows, snappystream has no compile support for windows, + please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR}) +else(WIN32) + set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") + + ExternalProject_Add( + extern_snappystream + GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git" + GIT_TAG "0.2.8" + PREFIX ${SNAPPYSTREAM_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + DEPENDS snappy + ) +endif(WIN32) add_library(snappystream STATIC IMPORTED GLOBAL) set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES}) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c3d7323545..456f26385c 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -31,8 +31,9 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zl ExternalProject_Add( extern_zlib ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/madler/zlib.git" - GIT_TAG "v1.2.8" + # GIT_REPOSITORY "https://github.com/madler/zlib.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/zlib.git" +# GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index abadda3adb..6b526f0103 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -3,13 +3,9 @@ add_subdirectory(platform) add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(string) - -add_subdirectory(pybind) -if (NOT WIN32) add_subdirectory(recordio) -endif(NOT WIN32) +add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) - add_subdirectory(train) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cb9057672c..42af482f85 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -68,11 +68,7 @@ if(WITH_GPU) else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() -if (NOT WIN32) - cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) -else() - cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) -endif (NOT WIN32) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index f06ef199d1..edd062e175 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -95,7 +95,8 @@ function(op_library TARGET) foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op" - "fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op") + "fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op" + ) if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() @@ -225,7 +226,6 @@ if(WITH_DISTRIBUTE) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY}) - find_library(RDMACM_LIBRARY NAMES rdmacm) ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY}) @@ -338,11 +338,7 @@ foreach(src ${GENERAL_OPS}) endforeach() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") - - -if (NOT WIN32) add_subdirectory(reader) -endif(NOT WIN32) foreach(src ${READER_LIBRARY}) set(OP_LIBRARY ${src} ${OP_LIBRARY}) endforeach() diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 0f31ca1a94..901a92ab5b 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -74,7 +74,7 @@ class CreatePyReaderOpMaker : public FileReaderMakerBase { "Name of the `LoDTensorBlockingQueueHolder` variable"); AddComment(R"DOC( - Create PyReader to support LoDTensor data feeding in Python side. + Create PyReader to support LoDTensor data feeding in Python side. )DOC"); } }; diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index c57a34c3a7..79f189222e 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -35,10 +35,10 @@ class ROIAlignOp : public framework::OperatorWithKernel { "The format of input tensor is NCHW."); PADDLE_ENFORCE(rois_dims.size() == 2, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); PADDLE_ENFORCE(rois_dims[1] == 4, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); float spatial_scale = ctx->Attrs().Get("spatial_scale"); @@ -103,7 +103,7 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor), " "ROIs (Regions of Interest) to pool over. " "should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]. " + "given as [[x1, y1, x2, y2], ...]. " "(x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates."); AddOutput("Out", diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 043ea680d1..3f6b2e46c7 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -40,10 +40,10 @@ class ROIPoolOp : public framework::OperatorWithKernel { "The format of input tensor is NCHW."); PADDLE_ENFORCE(rois_dims.size() == 2, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); PADDLE_ENFORCE(rois_dims[1] == kROISize, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); @@ -110,7 +110,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor), " "ROIs (Regions of Interest) to pool over. " "should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]. " + "given as [[x1, y1, x2, y2], ...]. " "Where batch_id is the id of the data, " "(x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates."); diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index f109dd685c..b579244673 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -86,7 +86,7 @@ class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker { .GreaterThan(1); AddComment(R"DOC( reorg operator used in Yolo v2. - The equation is: C2 = C1/blocksize * blocksize, W2 = W1 ∗ blocksize + offset % blocksize, H2 = H1 ∗ blocksize + offset / blocksize, + The equation is: C2 = C1/blocksize * blocksize, W2 = W1 * blocksize + offset % blocksize, H2 = H1 * blocksize + offset / blocksize, Reshape Input(X) into the shape according to Attr(blocksize). The data in Input(X) are unchanged. diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index 8823e97b0b..a07b993c8a 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -132,10 +132,12 @@ static void MkDir(const char *path) { } } #else - CreateDirectory(path, NULL); - auto errorno = GetLastError(); - if (errorno != ERROR_ALREADY_EXISTS) { - throw std::runtime_error(path_error); + BOOL return_value = CreateDirectory(path, NULL); + if (!return_value) { + auto errorno = GetLastError(); + if (errorno != ERROR_ALREADY_EXISTS) { + throw std::runtime_error(path_error); + } } #endif // !_WIN32 } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 6afa53cd36..cd8256f1c7 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,9 +1,8 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc) if(NOT WIN32) list(APPEND PYBIND_DEPS parallel_executor profiler) - list(APPEND PYBIND_SRCS recordio.cc) endif(NOT WIN32) if(WITH_PYTHON) if(WITH_AMD_GPU) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0d059d8aea..89959c389f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -357,19 +357,16 @@ All parameter, weight, gradient are variables in Paddle. return self.GetMutable(); }, py::return_value_policy::reference) +#endif .def("get_reader", [](Variable &self) -> framework::ReaderHolder * { PADDLE_ENFORCE(self.IsType()); return self.GetMutable(); }, - py::return_value_policy::reference) -#endif - ; + py::return_value_policy::reference); -#if !defined(_WIN32) py::class_(m, "Reader", "") .def("reset", &framework::ReaderHolder::ResetAll); -#endif using LoDTensorBlockingQueue = ::paddle::operators::reader::LoDTensorBlockingQueue; @@ -914,9 +911,9 @@ All parameter, weight, gradient are variables in Paddle. pybind11::gil_scoped_release release; self.Run(fetch_tensors, fetched_var_name); }); +#endif BindRecordIOWriter(&m); -#endif return m.ptr(); } } // namespace pybind diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 1b5009e761..2971319141 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -169,6 +169,12 @@ __all__ = [ 'bilinear_tensor_product', ] +# To avoid the api checker complains +if os.name == 'nt': + __all__.remove('dynamic_lstm') + __all__.remove('crf_decoding') + __all__.remove('roi_pool') + def fc(input, size, From 162f2d410912ebbe6dae12c4120d97ea69b9ffda Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 10:58:48 +0800 Subject: [PATCH 002/113] disable the openblas multi-thread on windows since no support adjust the python script --- paddle/fluid/platform/cpu_helper.cc | 6 + paddle/fluid/platform/init.cc | 7 - python/paddle/fluid/__init__.py | 3 +- python/paddle/fluid/contrib/inferencer.py | 4 +- python/paddle/fluid/contrib/trainer.py | 3 +- python/paddle/fluid/parallel_executor.py | 495 +++++++++++----------- 6 files changed, 258 insertions(+), 260 deletions(-) diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 234a04b5c2..4e52e8ff00 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -29,6 +29,12 @@ namespace platform { void SetNumThreads(int num_threads) { #ifdef PADDLE_USE_OPENBLAS + // windows has no support for openblas multi-thread +#ifdef _WIN32 + if (num_threads > 1) { + num_threads = 1; + } +#endif int real_num_threads = num_threads > 1 ? num_threads : 1; openblas_set_num_threads(real_num_threads); #elif defined(PADDLE_WITH_MKLML) diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 84d1b852cb..69bbe8794d 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -113,13 +113,6 @@ void InitDevices(bool init_p2p, const std::vector devices) { places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); -// windows has no support for openblas multi-thread -#ifdef _WIN32 - if (FLAGS_paddle_num_threads > 1) { - FLAGS_paddle_num_threads = 1; - } -#endif - #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8129918916..dbe49c98bd 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -47,7 +47,8 @@ from . import profiler from . import unique_name from . import recordio_writer from . import parallel_executor -from .parallel_executor import * +if os.name != 'nt': + from .parallel_executor import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py index b966ae01d0..b8d5f4ffea 100644 --- a/python/paddle/fluid/contrib/inferencer.py +++ b/python/paddle/fluid/contrib/inferencer.py @@ -15,15 +15,13 @@ from __future__ import print_function import contextlib -import os from .. import core from .. import executor from .. import framework from .. import io -if os.name != 'nt': - from .. import parallel_executor +from .. import parallel_executor from .. import unique_name from .trainer import check_and_get_place diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py index 096821a5ba..8569e486f9 100644 --- a/python/paddle/fluid/contrib/trainer.py +++ b/python/paddle/fluid/contrib/trainer.py @@ -28,8 +28,7 @@ from .. import framework from .. import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module from .. import optimizer as opt_module -if os.name != 'nt': - from .. import parallel_executor +from .. import parallel_executor from ..transpiler import distribute_transpiler __all__ = [ diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 3f4dd5eb71..33f6df67a4 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -25,263 +25,264 @@ import os __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy'] -ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy -BuildStrategy = core.ParallelExecutor.BuildStrategy - - -class ParallelExecutor(object): - """ - ParallelExecutor is designed for data parallelism, which focuses on distributing - the data across different nodes and every node operates on the data in parallel. - If you use ParallelExecutor to run the current program on GPU, the node means GPU - device, and ParallelExecutor will get the available GPU device automatically on - the current machine. If you use ParallelExecutor to run the current program on CPU, - the node means the CPU device, and you can specify the CPU device number by adding - 'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable - is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number - of CPUs in the system. - - Args: - use_cuda (bool): Whether to use CUDA or not. - loss_name (str): The loss name must set in training. Default None. - main_program (Program): The program that need to run, if not provided, - then default_main_program will be used. Default None. - share_vars_from(ParallelExecutor): If provide, it will share variables - from the specified ParallelExecutor. Default None. - exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run - the program in ParallelExecutor, for example how many threads are used to - execute the program, how many iterations to clean up the temp variables - which is generated during execution. For more information, please refer - to fluid.ExecutionStrategy. Default None. - build_strategy(BuildStrategy): build_strategy is used to control how to - build the SSA Graph in ParallelExecutor by setting the property, - for example reduce_strategy, gradient_scale_strategy. For more information, - please refer to fluid.BuildStrategy. Default None. - num_trainers(int): If greater than 1, NCCL will be initialized with - multiple rank of nodes, each node should have same number of GPUs. - Distributed training will be enabled then. Default 1. - trainer_id(int): Must use together with num_trainers. trainer_id is the - "rank" of current node starts from 0. Default 0. - scope(Scope): scope to run with, default use fluid.global_scope(). - - Returns: - ParallelExecutor: The initialized ParallelExecutor object. - - Raises: - TypeError: If share_vars_from is provided, but not ParallelExecutor object. - - Examples: - .. code-block:: python - - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) - test_exe = fluid.ParallelExecutor(use_cuda=True, - main_program=test_program, - share_vars_from=train_exe) - - train_loss, = train_exe.run([loss.name], feed=feed_dict) - test_loss, = test_exe.run([loss.name], feed=feed_dict) - """ - - def __init__(self, - use_cuda, - loss_name=None, - main_program=None, - share_vars_from=None, - exec_strategy=None, - build_strategy=None, - num_trainers=1, - trainer_id=0, - scope=None): - self._places = [] - self._act_places = [] - if use_cuda: - for i in six.moves.range(core.get_cuda_device_count()): - p = core.Place() - self._act_places.append(core.CUDAPlace(i)) - p.set_place(self._act_places[-1]) - self._places.append(p) - else: - cpu_num = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - for i in six.moves.range(cpu_num): - p = core.Place() - self._act_places.append(core.CPUPlace()) - p.set_place(self._act_places[-1]) - self._places.append(p) - assert self._places, "no place for execution" - - if exec_strategy is None: - exec_strategy = ExecutionStrategy() - exec_strategy.use_cuda = use_cuda - - if exec_strategy.num_threads == 0: - if use_cuda: - # Experiments on se-resnext shows that too many threads hurt - # performance. Worth tunning for other models in the future. - exec_strategy.num_threads = len(self._places) * 4 - else: - cpu_num = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - exec_strategy.num_threads = cpu_num * 2 - - # Set 1 thread num under nccl2 distribute - # env to make sure all gpus run ops in same order. - if num_trainers > 1: - assert (use_cuda) - # FIXME(gongwb): avoid this set. - exec_strategy.num_threads = 1 - - if build_strategy is None: - build_strategy = BuildStrategy() - - main = main_program - main = main if main else framework.default_main_program() - if scope == None: - scope = executor.global_scope() - - if share_vars_from and not isinstance(share_vars_from, - ParallelExecutor): - raise TypeError("share_vars_from must be ParallelExecutor.") - - local_scopes = share_vars_from.executor.local_scopes( - ) if share_vars_from else [] - - self.persistable_vars = [ - v.name for v in [ - var for var in main.list_vars() - if var.persistable and var.type != core.VarDesc.VarType.RAW - ] - ] - - self.executor = core.ParallelExecutor( - self._places, - set([ - cpt.to_text(p.name) - for p in main.global_block().iter_parameters() - if not p.stop_gradient - ]), - set(cpt.to_text(var) for var in self.persistable_vars), main.desc, - cpt.to_text(loss_name) - if loss_name else six.u(''), scope, local_scopes, exec_strategy, - build_strategy, num_trainers, trainer_id) - self.scope = scope - - def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): - """ - Run a parallel executor with fetch_list. - - The feed parameter can be a dict or a list. If feed is a dict, the - feed data will be split into multiple devices. If feed is a list, we - assume the data has been splitted into multiple devices, the each - element in the list will be copied to each device directly. - - For example, if the feed is a dict: - - >>> exe = ParallelExecutor() - >>> # the image will be splitted into devices. If there is two devices - >>> # each device will process an image with shape (24, 1, 28, 28) - >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) +if os.name != 'nt': + ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy + BuildStrategy = core.ParallelExecutor.BuildStrategy - For example, if the feed is a list: - >>> exe = ParallelExecutor() - >>> # each device will process each element in the list. - >>> # the 1st device will process an image with shape (48, 1, 28, 28) - >>> # the 2nd device will process an image with shape (32, 1, 28, 28) - >>> # - >>> # you can use exe.device_count to get the device number. - >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))}, - >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, - >>> ]) + class ParallelExecutor(object): + """ + ParallelExecutor is designed for data parallelism, which focuses on distributing + the data across different nodes and every node operates on the data in parallel. + If you use ParallelExecutor to run the current program on GPU, the node means GPU + device, and ParallelExecutor will get the available GPU device automatically on + the current machine. If you use ParallelExecutor to run the current program on CPU, + the node means the CPU device, and you can specify the CPU device number by adding + 'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable + is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number + of CPUs in the system. Args: - fetch_list(list): The fetched variable names - feed(list|dict|None): The feed variables. If the feed is a dict, - tensors in that dict will be splitted into each devices. If - the feed is a list, each element of the list will be copied - to each device. Default None. - feed_dict: Alias for feed parameter, for backward compatibility. - This parameter has been deprecated. Default None. - return_numpy(bool): Whether converts the fetched tensor to numpy. - Default: True. + use_cuda (bool): Whether to use CUDA or not. + loss_name (str): The loss name must set in training. Default None. + main_program (Program): The program that need to run, if not provided, + then default_main_program will be used. Default None. + share_vars_from(ParallelExecutor): If provide, it will share variables + from the specified ParallelExecutor. Default None. + exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run + the program in ParallelExecutor, for example how many threads are used to + execute the program, how many iterations to clean up the temp variables + which is generated during execution. For more information, please refer + to fluid.ExecutionStrategy. Default None. + build_strategy(BuildStrategy): build_strategy is used to control how to + build the SSA Graph in ParallelExecutor by setting the property, + for example reduce_strategy, gradient_scale_strategy. For more information, + please refer to fluid.BuildStrategy. Default None. + num_trainers(int): If greater than 1, NCCL will be initialized with + multiple rank of nodes, each node should have same number of GPUs. + Distributed training will be enabled then. Default 1. + trainer_id(int): Must use together with num_trainers. trainer_id is the + "rank" of current node starts from 0. Default 0. + scope(Scope): scope to run with, default use fluid.global_scope(). Returns: - List: The fetched result list. + ParallelExecutor: The initialized ParallelExecutor object. Raises: - ValueError: If the feed is a list, but its length is not equal the - length of active places, or its element's is not dict. - - NOTES: - 1. If the feed's type is dict, the number of data that feeds to - ParallelExecutor must be bigger than active places. Otherwise, - it will throw exception from C++ side. Special attention should be - paid to check whether the last batch of the dataset is bigger - than active places. - 2. If active places are more than one, the fetch results for each - variable is a list, and each element of this list is the variable of - respective active place. + TypeError: If share_vars_from is provided, but not ParallelExecutor object. Examples: .. code-block:: python - pe = fluid.ParallelExecutor(use_cuda=use_cuda, - loss_name=avg_cost.name, - main_program=fluid.default_main_program()) - loss = pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name])) + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) + test_exe = fluid.ParallelExecutor(use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + test_loss, = test_exe.run([loss.name], feed=feed_dict) """ - if feed is None and feed_dict is not None: - feed = feed_dict - print( - "`feed_dict` is deprecated. Please use `feed=`", - file=sys.stderr) - - if isinstance(feed, dict): - feed_tensor_dict = dict() - for feed_name in feed: - feed_tensor = feed[feed_name] - if not isinstance(feed_tensor, core.LoDTensor): - feed_tensor = core.LoDTensor() - # always set to CPU place, since the tensor need to be splitted - # it is fast in CPU - feed_tensor.set(feed[feed_name], core.CPUPlace()) - feed_tensor_dict[feed_name] = feed_tensor - - self.executor.feed_and_split_tensor_into_local_scopes( - feed_tensor_dict) - elif isinstance(feed, list) or isinstance(feed, tuple): - if len(feed) != len(self._act_places): - raise ValueError( - "Feed a list of tensor, the list should be the same size as places" - ) - - res = list() - - for i, each in enumerate(feed): - if not isinstance(each, dict): - raise TypeError( - "Each element of feed list should be a dict") - res_dict = dict() - for feed_name in each: - tensor = each[feed_name] - if not isinstance(tensor, core.LoDTensor): - tmp = core.LoDTensor() - tmp.set(tensor, self._act_places[i]) - tensor = tmp - res_dict[feed_name] = tensor - res.append(res_dict) - self.executor.feed_tensors_into_local_scopes(res) - - fetch_var_name = '@FETCHED_VAR_NAME@' - self.executor.run(fetch_list, fetch_var_name) - arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() - - if return_numpy: - return executor.as_numpy(arr) - - return [arr[i] for i in range(len(arr))] - - @property - def device_count(self): - return len(self._act_places) + + def __init__(self, + use_cuda, + loss_name=None, + main_program=None, + share_vars_from=None, + exec_strategy=None, + build_strategy=None, + num_trainers=1, + trainer_id=0, + scope=None): + self._places = [] + self._act_places = [] + if use_cuda: + for i in six.moves.range(core.get_cuda_device_count()): + p = core.Place() + self._act_places.append(core.CUDAPlace(i)) + p.set_place(self._act_places[-1]) + self._places.append(p) + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + for i in six.moves.range(cpu_num): + p = core.Place() + self._act_places.append(core.CPUPlace()) + p.set_place(self._act_places[-1]) + self._places.append(p) + assert self._places, "no place for execution" + + if exec_strategy is None: + exec_strategy = ExecutionStrategy() + exec_strategy.use_cuda = use_cuda + + if exec_strategy.num_threads == 0: + if use_cuda: + # Experiments on se-resnext shows that too many threads hurt + # performance. Worth tunning for other models in the future. + exec_strategy.num_threads = len(self._places) * 4 + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + exec_strategy.num_threads = cpu_num * 2 + + # Set 1 thread num under nccl2 distribute + # env to make sure all gpus run ops in same order. + if num_trainers > 1: + assert (use_cuda) + # FIXME(gongwb): avoid this set. + exec_strategy.num_threads = 1 + + if build_strategy is None: + build_strategy = BuildStrategy() + + main = main_program + main = main if main else framework.default_main_program() + if scope == None: + scope = executor.global_scope() + + if share_vars_from and not isinstance(share_vars_from, + ParallelExecutor): + raise TypeError("share_vars_from must be ParallelExecutor.") + + local_scopes = share_vars_from.executor.local_scopes( + ) if share_vars_from else [] + + self.persistable_vars = [ + v.name for v in [ + var for var in main.list_vars() + if var.persistable and var.type != core.VarDesc.VarType.RAW + ] + ] + + self.executor = core.ParallelExecutor( + self._places, + set([ + cpt.to_text(p.name) + for p in main.global_block().iter_parameters() + if not p.stop_gradient + ]), + set(cpt.to_text(var) for var in self.persistable_vars), main.desc, + cpt.to_text(loss_name) + if loss_name else six.u(''), scope, local_scopes, exec_strategy, + build_strategy, num_trainers, trainer_id) + self.scope = scope + + def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): + """ + Run a parallel executor with fetch_list. + + The feed parameter can be a dict or a list. If feed is a dict, the + feed data will be split into multiple devices. If feed is a list, we + assume the data has been splitted into multiple devices, the each + element in the list will be copied to each device directly. + + For example, if the feed is a dict: + + >>> exe = ParallelExecutor() + >>> # the image will be splitted into devices. If there is two devices + >>> # each device will process an image with shape (24, 1, 28, 28) + >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) + + For example, if the feed is a list: + + >>> exe = ParallelExecutor() + >>> # each device will process each element in the list. + >>> # the 1st device will process an image with shape (48, 1, 28, 28) + >>> # the 2nd device will process an image with shape (32, 1, 28, 28) + >>> # + >>> # you can use exe.device_count to get the device number. + >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))}, + >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, + >>> ]) + + Args: + fetch_list(list): The fetched variable names + feed(list|dict|None): The feed variables. If the feed is a dict, + tensors in that dict will be splitted into each devices. If + the feed is a list, each element of the list will be copied + to each device. Default None. + feed_dict: Alias for feed parameter, for backward compatibility. + This parameter has been deprecated. Default None. + return_numpy(bool): Whether converts the fetched tensor to numpy. + Default: True. + + Returns: + List: The fetched result list. + + Raises: + ValueError: If the feed is a list, but its length is not equal the + length of active places, or its element's is not dict. + + NOTES: + 1. If the feed's type is dict, the number of data that feeds to + ParallelExecutor must be bigger than active places. Otherwise, + it will throw exception from C++ side. Special attention should be + paid to check whether the last batch of the dataset is bigger + than active places. + 2. If active places are more than one, the fetch results for each + variable is a list, and each element of this list is the variable of + respective active place. + + Examples: + .. code-block:: python + + pe = fluid.ParallelExecutor(use_cuda=use_cuda, + loss_name=avg_cost.name, + main_program=fluid.default_main_program()) + loss = pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])) + """ + if feed is None and feed_dict is not None: + feed = feed_dict + print( + "`feed_dict` is deprecated. Please use `feed=`", + file=sys.stderr) + + if isinstance(feed, dict): + feed_tensor_dict = dict() + for feed_name in feed: + feed_tensor = feed[feed_name] + if not isinstance(feed_tensor, core.LoDTensor): + feed_tensor = core.LoDTensor() + # always set to CPU place, since the tensor need to be splitted + # it is fast in CPU + feed_tensor.set(feed[feed_name], core.CPUPlace()) + feed_tensor_dict[feed_name] = feed_tensor + + self.executor.feed_and_split_tensor_into_local_scopes( + feed_tensor_dict) + elif isinstance(feed, list) or isinstance(feed, tuple): + if len(feed) != len(self._act_places): + raise ValueError( + "Feed a list of tensor, the list should be the same size as places" + ) + + res = list() + + for i, each in enumerate(feed): + if not isinstance(each, dict): + raise TypeError( + "Each element of feed list should be a dict") + res_dict = dict() + for feed_name in each: + tensor = each[feed_name] + if not isinstance(tensor, core.LoDTensor): + tmp = core.LoDTensor() + tmp.set(tensor, self._act_places[i]) + tensor = tmp + res_dict[feed_name] = tensor + res.append(res_dict) + self.executor.feed_tensors_into_local_scopes(res) + + fetch_var_name = '@FETCHED_VAR_NAME@' + self.executor.run(fetch_list, fetch_var_name) + arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() + + if return_numpy: + return executor.as_numpy(arr) + + return [arr[i] for i in range(len(arr))] + + @property + def device_count(self): + return len(self._act_places) From d1a1fafc4c933e51341004b83f225d786c3fed49 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 11:21:29 +0800 Subject: [PATCH 003/113] code style --- paddle/fluid/platform/cpu_helper.cc | 2 +- python/paddle/fluid/parallel_executor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 4e52e8ff00..bd6aedb3ac 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -29,7 +29,7 @@ namespace platform { void SetNumThreads(int num_threads) { #ifdef PADDLE_USE_OPENBLAS - // windows has no support for openblas multi-thread +// windows has no support for openblas multi-thread #ifdef _WIN32 if (num_threads > 1) { num_threads = 1; diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 33f6df67a4..0d53f53a9e 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -29,7 +29,6 @@ if os.name != 'nt': ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy - class ParallelExecutor(object): """ ParallelExecutor is designed for data parallelism, which focuses on distributing @@ -161,7 +160,8 @@ if os.name != 'nt': for p in main.global_block().iter_parameters() if not p.stop_gradient ]), - set(cpt.to_text(var) for var in self.persistable_vars), main.desc, + set(cpt.to_text(var) + for var in self.persistable_vars), main.desc, cpt.to_text(loss_name) if loss_name else six.u(''), scope, local_scopes, exec_strategy, build_strategy, num_trainers, trainer_id) From dc80be275db8c1a75d222b0da11aba4c92c29aa8 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 11:21:29 +0800 Subject: [PATCH 004/113] code style test=develop --- cmake/external/eigen.cmake | 10 ++++------ cmake/external/gflags.cmake | 5 ++--- cmake/external/glog.cmake | 3 +-- cmake/external/gtest.cmake | 5 ++--- cmake/external/protobuf.cmake | 5 ++--- cmake/external/zlib.cmake | 5 ++--- paddle/fluid/platform/cpu_helper.cc | 2 +- python/paddle/fluid/parallel_executor.py | 4 ++-- 8 files changed, 16 insertions(+), 23 deletions(-) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 98079678ae..573ad5e5f0 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -16,9 +16,8 @@ if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} -# GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" -# GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 - GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" + GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" + GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" @@ -30,11 +29,10 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} -# GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" - GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" + GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen -# GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c + GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c PREFIX ${EIGEN_SOURCE_DIR} DOWNLOAD_NAME "eigen" UPDATE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 7c062d682c..4e98e4bf88 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,9 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} -# GIT_REPOSITORY "https://github.com/gflags/gflags.git" - GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gflags.git" -# GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a + GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index a3f3c6adf3..8cd0455c16 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -34,14 +34,13 @@ ELSE() SET(GLOG_REPOSITORY "https://github.com/google/glog.git") SET(GLOG_TAG "v0.3.5") ENDIF() - SET(GLOG_REPOSITORY "http://admin@172.20.90.14:8080/r/glog.git") ExternalProject_Add( extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY ${GLOG_REPOSITORY} - # GIT_TAG ${GLOG_TAG} + GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index da539d52bd..d335298742 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -43,9 +43,8 @@ IF(WITH_TESTING) extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${GTEST_DEPENDS} - # GIT_REPOSITORY "https://github.com/google/googletest.git" - GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gtest.git" -# GIT_TAG "release-1.8.0" + GIT_REPOSITORY "https://github.com/google/googletest.git" + GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 94d8ac30cc..e1e619e572 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -202,9 +202,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") ENDIF() - # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") - # SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") - SET(PROTOBUF_REPO http://admin@172.20.90.14:8080/r/protobuf.git) + SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") IF(MOBILE_INFERENCE) # The reason why the official version is not used is described in # https://github.com/PaddlePaddle/Paddle/issues/6114 diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 456f26385c..c3d7323545 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -31,9 +31,8 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zl ExternalProject_Add( extern_zlib ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY "https://github.com/madler/zlib.git" - GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/zlib.git" -# GIT_TAG "v1.2.8" + GIT_REPOSITORY "https://github.com/madler/zlib.git" + GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 4e52e8ff00..bd6aedb3ac 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -29,7 +29,7 @@ namespace platform { void SetNumThreads(int num_threads) { #ifdef PADDLE_USE_OPENBLAS - // windows has no support for openblas multi-thread +// windows has no support for openblas multi-thread #ifdef _WIN32 if (num_threads > 1) { num_threads = 1; diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 33f6df67a4..0d53f53a9e 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -29,7 +29,6 @@ if os.name != 'nt': ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy - class ParallelExecutor(object): """ ParallelExecutor is designed for data parallelism, which focuses on distributing @@ -161,7 +160,8 @@ if os.name != 'nt': for p in main.global_block().iter_parameters() if not p.stop_gradient ]), - set(cpt.to_text(var) for var in self.persistable_vars), main.desc, + set(cpt.to_text(var) + for var in self.persistable_vars), main.desc, cpt.to_text(loss_name) if loss_name else six.u(''), scope, local_scopes, exec_strategy, build_strategy, num_trainers, trainer_id) From fcbd5a12b802560f279e30086d03ef152f760ab5 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 15:23:05 +0800 Subject: [PATCH 005/113] add create_recordio_file_reader back --- python/paddle/fluid/layers/io.py | 118 +++++++++++++++---------------- 1 file changed, 58 insertions(+), 60 deletions(-) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index a9075045a2..8e18a6e784 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -347,72 +347,70 @@ def _copy_reader_create_op_(block, op): return new_op -if os.name != 'nt': - - @templatedoc(op_type='create_recordio_file_reader') - def open_recordio_file(filename, - shapes, - lod_levels, - dtypes, - pass_num=1, - for_parallel=True): - """ - ${comment} - - Args: - filename(${filename_type}): ${filename_comment}. - shapes(list): List of tuples which declaring data shapes. - lod_levels(${lod_levels_type}): ${lod_levels_comment}. - dtypes(list): List of strs which declaring data type. - pass_num(int): Number of passes to run. - for_parallel(Bool): Set it as True if you are going to run - subsequent operators in parallel. - - Returns: - ${out_comment}. - - Examples: - - >>> import paddle.fluid as fluid - >>> reader = fluid.layers.io.open_recordio_file( - >>> filename='./data.recordio', - >>> shapes=[(3,224,224), (1)], - >>> lod_levels=[0, 0], - >>> dtypes=['float32', 'int64']) - >>> # Via the reader, we can use 'read_file' layer to get data: - >>> image, label = fluid.layers.io.read_file(reader) - """ - dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] - shape_concat = [] - ranks = [] +@templatedoc(op_type='create_recordio_file_reader') +def open_recordio_file(filename, + shapes, + lod_levels, + dtypes, + pass_num=1, + for_parallel=True): + """ + ${comment} - for shape in shapes: - shape_concat.extend(shape) - ranks.append(len(shape)) + Args: + filename(${filename_type}): ${filename_comment}. + shapes(list): List of tuples which declaring data shapes. + lod_levels(${lod_levels_type}): ${lod_levels_comment}. + dtypes(list): List of strs which declaring data type. + pass_num(int): Number of passes to run. + for_parallel(Bool): Set it as True if you are going to run + subsequent operators in parallel. - var_name = unique_name('open_recordio_file') + Returns: + ${out_comment}. - startup_blk = default_startup_program().current_block() - startup_var = startup_blk.create_var(name=var_name) - startup_blk.append_op( - type='create_recordio_file_reader', - outputs={'Out': [startup_var]}, - attrs={ - 'shape_concat': shape_concat, - 'lod_levels': lod_levels, - 'filename': filename, - 'ranks': ranks - }) + Examples: - startup_var.desc.set_dtypes(dtypes) - startup_var.persistable = True - main_prog_var = _copy_reader_var_( - default_main_program().current_block(), startup_var) + >>> import paddle.fluid as fluid + >>> reader = fluid.layers.io.open_recordio_file( + >>> filename='./data.recordio', + >>> shapes=[(3,224,224), (1)], + >>> lod_levels=[0, 0], + >>> dtypes=['float32', 'int64']) + >>> # Via the reader, we can use 'read_file' layer to get data: + >>> image, label = fluid.layers.io.read_file(reader) + """ + dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] + shape_concat = [] + ranks = [] + + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) - if pass_num > 1: - main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) + var_name = unique_name('open_recordio_file') - return monkey_patch_reader_methods(main_prog_var) + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=var_name) + startup_blk.append_op( + type='create_recordio_file_reader', + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'filename': filename, + 'ranks': ranks + }) + + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + main_prog_var = _copy_reader_var_( + default_main_program().current_block(), startup_var) + + if pass_num > 1: + main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) + + return monkey_patch_reader_methods(main_prog_var) def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): From 1dc1dd94d1c349c6bbed6fe826ae1d25a3983603 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 16:07:25 +0800 Subject: [PATCH 006/113] fix code style test=develop --- python/paddle/fluid/layers/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 8e18a6e784..3f47053961 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -404,8 +404,8 @@ def open_recordio_file(filename, startup_var.desc.set_dtypes(dtypes) startup_var.persistable = True - main_prog_var = _copy_reader_var_( - default_main_program().current_block(), startup_var) + main_prog_var = _copy_reader_var_(default_main_program().current_block(), + startup_var) if pass_num > 1: main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) From 695e2aba5e8396ff0719da8516e38b1ef4782c05 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 19:33:50 +0800 Subject: [PATCH 007/113] fix the gtest.cmake on windows --- cmake/external/gtest.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index da539d52bd..943767fb17 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -51,7 +51,11 @@ IF(WITH_TESTING) CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_GMOCK=ON From b942f4760ab30f6a107c6cf944032c9dde143528 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 22:04:11 +0800 Subject: [PATCH 008/113] fix cc_test on windows --- cmake/generic.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index e21f89c7c5..111627a932 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -351,6 +351,9 @@ function(cc_test TARGET_NAME) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + if(WIN32) + target_link_libraries(${TARGET_NAME} shlwapi) + endif(WIN32) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} From ef943bd6cd463b4e00bb18cc137334a6b78fca55 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 17 Nov 2018 00:32:33 +0800 Subject: [PATCH 009/113] fix the win build test=develop --- paddle/fluid/operators/CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index df2a3e7aa6..a2334c1499 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE) add_subdirectory(distributed_ops) endif() -if (NOT WIN32) - add_subdirectory(reader) -endif() +add_subdirectory(reader) if (NOT WIN32) add_subdirectory(nccl) @@ -49,9 +47,9 @@ endif() set(COMMON_OP_DEPS "") -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler) if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions dynload_warpctc jit_kernel) endif() if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) From a395942c6a5246f1702e592e1f3d369caa3accc1 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 17 Nov 2018 08:38:02 +0800 Subject: [PATCH 010/113] remove fused compile support on windows test=develop --- paddle/fluid/operators/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a2334c1499..40246d05e9 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -11,7 +11,9 @@ add_subdirectory(controlflow) add_subdirectory(csp) add_subdirectory(detection) add_subdirectory(elementwise) -add_subdirectory(fused) +if(NOT WIN32) + add_subdirectory(fused) +endif(NOT WIN32) add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) From c75dc885b58000b018414ab442097ee515244b9c Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 17 Nov 2018 14:36:56 +0800 Subject: [PATCH 011/113] add the jit support test=develop --- paddle/fluid/operators/CMakeLists.txt | 7 ++-- paddle/fluid/operators/math/CMakeLists.txt | 36 +++++++++---------- .../math/detail/activation_functions.h | 7 ++++ paddle/fluid/operators/math/jit_code.cc | 5 +++ 4 files changed, 31 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 40246d05e9..10748b0cda 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -11,9 +11,7 @@ add_subdirectory(controlflow) add_subdirectory(csp) add_subdirectory(detection) add_subdirectory(elementwise) -if(NOT WIN32) - add_subdirectory(fused) -endif(NOT WIN32) +add_subdirectory(fused) add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) @@ -50,8 +48,9 @@ endif() set(COMMON_OP_DEPS "") set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} lstm_compute matrix_bit_code gru_compute activation_functions jit_kernel) if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions dynload_warpctc jit_kernel) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch dynload_warpctc) endif() if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 4cd014cbad..08c8dbbfe8 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,6 +1,4 @@ -if (NOT WIN32) - add_subdirectory(detail) -endif(NOT WIN32) +add_subdirectory(detail) function(math_library TARGET) # math_library is a function to create math library. @@ -43,10 +41,8 @@ math_library(depthwise_conv) math_library(im2col) math_library(sampler) -if (NOT WIN32) # windows do not support avx functions yet. - math_library(gru_compute DEPS activation_functions math_function) - math_library(lstm_compute DEPS activation_functions) -endif (NOT WIN32) +math_library(gru_compute DEPS activation_functions math_function) +math_library(lstm_compute DEPS activation_functions) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) math_library(math_function DEPS blas) @@ -58,9 +54,9 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) math_library(sequence_scale) math_library(softmax DEPS math_function) -if (NOT WIN32) - math_library(matrix_bit_code) -endif (NOT WIN32) + +math_library(matrix_bit_code) + math_library(unpooling) math_library(vol2col) @@ -76,13 +72,13 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) -if (NOT WIN32) - set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) - set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) - if(WITH_XBYAK) - list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) - list(APPEND JIT_KERNEL_DEPS xbyak) - endif() - cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) - cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) -endif (NOT WIN32) + +set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) +if(WITH_XBYAK) + list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) + list(APPEND JIT_KERNEL_DEPS xbyak) +endif() +cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) +cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) + diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index b127fbe8c8..42fb45a8a5 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -15,6 +15,13 @@ limitations under the License. */ #pragma once #include #include + +#ifdef _WIN32 +#undef __AVX__ +#undef __AVX__2 +#endif + + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index e3b600d442..0f4b8f65ac 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -118,7 +118,12 @@ void VXXJitCode::generate() { ret(); } +#ifdef _WIN32 +#define ALIGN32 +#else #define ALIGN32 __attribute__((aligned(32))) +#endif + #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f #define CEPHES_LOG2EF 1.44269504088896341 From 5e46c98362897fbf043eb8387618740fbbd6fd07 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 17 Nov 2018 14:41:29 +0800 Subject: [PATCH 012/113] add the jit support, test=develop --- paddle/fluid/operators/math/detail/activation_functions.h | 6 ------ paddle/fluid/platform/cpu_info.h | 5 +++++ paddle/fluid/platform/enforce.h | 5 +++++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index 42fb45a8a5..2b3d38d95a 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -16,12 +16,6 @@ limitations under the License. */ #include #include -#ifdef _WIN32 -#undef __AVX__ -#undef __AVX__2 -#endif - - #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 6810a1651a..1b4840d9a1 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -14,6 +14,11 @@ limitations under the License. */ #pragma once +#ifdef _WIN32 +#undef __AVX__ +#undef __AVX2__ +#endif + #include namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a251bfcd99..c03bbd59ac 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -14,6 +14,11 @@ limitations under the License. */ #pragma once +#ifdef _WIN32 +#undef __AVX__ +#undef __AVX2__ +#endif + #ifdef __GNUC__ #include // for __cxa_demangle #endif // __GNUC__ From 928efeed46132df27d1c389be046bdc31c9451f4 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 17 Nov 2018 14:41:29 +0800 Subject: [PATCH 013/113] add the jit support, test=develop --- cmake/operators.cmake | 5 +++-- paddle/fluid/operators/math/detail/activation_functions.h | 6 ------ paddle/fluid/platform/cpu_info.h | 5 +++++ paddle/fluid/platform/enforce.h | 5 +++++ 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index c9d0f80da2..5636342ef7 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,8 +84,9 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" - "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" +# "hierarchical_sigmoid_op" "cumsum_op" +# "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index 42fb45a8a5..2b3d38d95a 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -16,12 +16,6 @@ limitations under the License. */ #include #include -#ifdef _WIN32 -#undef __AVX__ -#undef __AVX__2 -#endif - - #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 6810a1651a..1b4840d9a1 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -14,6 +14,11 @@ limitations under the License. */ #pragma once +#ifdef _WIN32 +#undef __AVX__ +#undef __AVX2__ +#endif + #include namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a251bfcd99..c03bbd59ac 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -14,6 +14,11 @@ limitations under the License. */ #pragma once +#ifdef _WIN32 +#undef __AVX__ +#undef __AVX2__ +#endif + #ifdef __GNUC__ #include // for __cxa_demangle #endif // __GNUC__ From a3e952f41d9081b8d0f69128f7d758fd95f97f96 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sun, 18 Nov 2018 12:19:05 +0800 Subject: [PATCH 014/113] add the jit back fix compile error on windows --- CMakeLists.txt | 5 + cmake/operators.cmake | 5 +- cmake/simd.cmake | 25 +- paddle/fluid/operators/CMakeLists.txt | 9 +- .../fluid/operators/hierarchical_sigmoid_op.h | 2 +- paddle/fluid/operators/math/CMakeLists.txt | 35 +- paddle/fluid/operators/math/matrix_bit_code.h | 3 +- python/paddle/fluid/layers/nn.py | 374 +++++++++--------- python/paddle/fluid/layers/ops.py | 41 +- 9 files changed, 241 insertions(+), 258 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c3bc60d57b..c2804e234d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,6 +130,11 @@ if (APPLE OR WIN32) "Disable MKL for building on mac and windows" FORCE) endif() +if (WIN32) + set(WITH_AVX OFF CACHE STRING + "Disable AVX when compiling for Windows" FORCE) +endif() + set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index c9d0f80da2..5e8b95b3e2 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,9 +84,8 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" - "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" - "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" + "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 566dc75fda..4926fb9913 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -70,17 +70,20 @@ int main() return 0; }" AVX_FOUND) -# Check AVX 2 -set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) -set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; -}" AVX2_FOUND) +# disable AVX2 by default on windows +if(NOT WIN32) + # Check AVX 2 + set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) + set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; + }" AVX2_FOUND) +endif(NOT WIN32) # Check AVX512F set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index df2a3e7aa6..284bf5dc9e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE) add_subdirectory(distributed_ops) endif() -if (NOT WIN32) - add_subdirectory(reader) -endif() +add_subdirectory(reader) if (NOT WIN32) add_subdirectory(nccl) @@ -49,9 +47,10 @@ endif() set(COMMON_OP_DEPS "") -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} lstm_compute matrix_bit_code sequence2batch gru_compute activation_functions jit_kernel) if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 64096a717b..79980cda53 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -111,7 +111,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { auto pre_out_mat = EigenMatrix::From(*pre_out); auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); auto out_grad_mat = EigenMatrix::From(*out_grad); - Eigen::array bcast({{1, static_cast(pre_out_grad.dims()[1])}}); + Eigen::array bcast{1, static_cast(pre_out_grad.dims()[1])}; // softrelu derivative pre_out_grad_mat.device(place) = diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 4cd014cbad..e9397d552d 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,6 +1,4 @@ -if (NOT WIN32) - add_subdirectory(detail) -endif(NOT WIN32) +add_subdirectory(detail) function(math_library TARGET) # math_library is a function to create math library. @@ -43,10 +41,8 @@ math_library(depthwise_conv) math_library(im2col) math_library(sampler) -if (NOT WIN32) # windows do not support avx functions yet. - math_library(gru_compute DEPS activation_functions math_function) - math_library(lstm_compute DEPS activation_functions) -endif (NOT WIN32) +math_library(gru_compute DEPS activation_functions math_function) +math_library(lstm_compute DEPS activation_functions) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) math_library(math_function DEPS blas) @@ -58,9 +54,9 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) math_library(sequence_scale) math_library(softmax DEPS math_function) -if (NOT WIN32) - math_library(matrix_bit_code) -endif (NOT WIN32) + +math_library(matrix_bit_code) + math_library(unpooling) math_library(vol2col) @@ -76,13 +72,12 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) -if (NOT WIN32) - set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) - set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) - if(WITH_XBYAK) - list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) - list(APPEND JIT_KERNEL_DEPS xbyak) - endif() - cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) - cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) -endif (NOT WIN32) + +set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) +if(WITH_XBYAK) + list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) + list(APPEND JIT_KERNEL_DEPS xbyak) +endif() +cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) +cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 07854c8358..c329b8b611 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -67,7 +67,7 @@ inline constexpr size_t FindLastSet(size_t x) { : (std::is_same::value // NOLINT ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0) : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0)); - +} #else // windows don't have built-in clz, ctz function template @@ -92,7 +92,6 @@ inline int clz(const T& value) { inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } #endif // !_WIN32 -} struct SimpleCode { SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {} diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9465d97564..a2bab64384 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -170,12 +170,6 @@ __all__ = [ 'bilinear_tensor_product', ] -# To avoid the api checker complains -if os.name == 'nt': - __all__.remove('dynamic_lstm') - __all__.remove('crf_decoding') - __all__.remove('roi_pool') - def fc(input, size, @@ -349,128 +343,126 @@ def embedding(input, return tmp -if os.name != 'nt': +@templatedoc(op_type="lstm") +def dynamic_lstm(input, + size, + h_0=None, + c_0=None, + param_attr=None, + bias_attr=None, + use_peepholes=True, + is_reverse=False, + gate_activation='sigmoid', + cell_activation='tanh', + candidate_activation='tanh', + dtype='float32', + name=None): + """ + ${comment} - @templatedoc(op_type="lstm") - def dynamic_lstm(input, - size, - h_0=None, - c_0=None, - param_attr=None, - bias_attr=None, - use_peepholes=True, - is_reverse=False, - gate_activation='sigmoid', - cell_activation='tanh', - candidate_activation='tanh', - dtype='float32', - name=None): - """ - ${comment} - - Args: - input (Variable): ${input_comment} - size (int): 4 * hidden size. - h_0(Variable): The initial hidden state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size and D is the hidden size. - c_0(Variable): The initial cell state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size. `h_0` and `c_0` can be NULL but only at the same time. - param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weights. - - - Weights = {:math:`W_{ch}, W_{ih}, \ - W_{fh}, W_{oh}`} - - The shape is (D x 4D), where D is the hidden - size. - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as param_attr. - If the Initializer of the param_attr is not set, the - parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The bias attribute for the learnable bias - weights, which contains two parts, input-hidden - bias weights and peephole connections weights if - setting `use_peepholes` to `True`. - - 1. `use_peepholes = False` - - Biases = {:math:`b_c, b_i, b_f, b_o`}. - - The shape is (1 x 4D). - 2. `use_peepholes = True` - - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ - W_{fc}, W_{oc}`}. - - The shape is (1 x 7D). - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as bias_attr. - If the Initializer of the bias_attr is not set, - the bias is initialized zero. Default: None. - use_peepholes (bool): ${use_peepholes_comment} - is_reverse (bool): ${is_reverse_comment} - gate_activation (str): ${gate_activation_comment} - cell_activation (str): ${cell_activation_comment} - candidate_activation (str): ${candidate_activation_comment} - dtype (str): Data type. Choices = ["float32", "float64"], default "float32". - name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - - Returns: - tuple: The hidden state, and cell state of LSTM. The shape of both \ - is (T x D), and lod is the same with the `input`. - - Examples: - .. code-block:: python - - hidden_dim = 512 - forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, - bias_attr=False) - forward, _ = fluid.layers.dynamic_lstm( - input=forward_proj, size=hidden_dim * 4, use_peepholes=False) - """ - assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." - helper = LayerHelper('lstm', **locals()) - size = size // 4 - weight = helper.create_parameter( - attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) - bias_size = [1, 7 * size] - if not use_peepholes: - bias_size[1] = 4 * size - bias = helper.create_parameter( - attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + Args: + input (Variable): ${input_comment} + size (int): 4 * hidden size. + h_0(Variable): The initial hidden state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size and D is the hidden size. + c_0(Variable): The initial cell state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size. `h_0` and `c_0` can be NULL but only at the same time. + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weights. - hidden = helper.create_variable_for_type_inference(dtype) - cell = helper.create_variable_for_type_inference(dtype) - batch_gate = helper.create_variable_for_type_inference(dtype) - batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) - inputs = {'Input': input, 'Weight': weight, 'Bias': bias} - batch_size = input.shape[0] - if h_0: - assert h_0.shape == (batch_size, size), \ - 'The shape of h0 should be (batch_size, %d)' % size - inputs['H0'] = h_0 - if c_0: - assert c_0.shape == (batch_size, size), \ - 'The shape of c0 should be (batch_size, %d)' % size - inputs['C0'] = c_0 + - Weights = {:math:`W_{ch}, W_{ih}, \ + W_{fh}, W_{oh}`} + - The shape is (D x 4D), where D is the hidden + size. - helper.append_op( - type='lstm', - inputs=inputs, - outputs={ - 'Hidden': hidden, - 'Cell': cell, - 'BatchGate': batch_gate, - 'BatchCellPreAct': batch_cell_pre_act - }, - attrs={ - 'use_peepholes': use_peepholes, - 'is_reverse': is_reverse, - 'gate_activation': gate_activation, - 'cell_activation': cell_activation, - 'candidate_activation': candidate_activation - }) - return hidden, cell + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as param_attr. + If the Initializer of the param_attr is not set, the + parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|None): The bias attribute for the learnable bias + weights, which contains two parts, input-hidden + bias weights and peephole connections weights if + setting `use_peepholes` to `True`. + + 1. `use_peepholes = False` + - Biases = {:math:`b_c, b_i, b_f, b_o`}. + - The shape is (1 x 4D). + 2. `use_peepholes = True` + - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ + W_{fc}, W_{oc}`}. + - The shape is (1 x 7D). + + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as bias_attr. + If the Initializer of the bias_attr is not set, + the bias is initialized zero. Default: None. + use_peepholes (bool): ${use_peepholes_comment} + is_reverse (bool): ${is_reverse_comment} + gate_activation (str): ${gate_activation_comment} + cell_activation (str): ${cell_activation_comment} + candidate_activation (str): ${candidate_activation_comment} + dtype (str): Data type. Choices = ["float32", "float64"], default "float32". + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + tuple: The hidden state, and cell state of LSTM. The shape of both \ + is (T x D), and lod is the same with the `input`. + + Examples: + .. code-block:: python + + hidden_dim = 512 + forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=forward_proj, size=hidden_dim * 4, use_peepholes=False) + """ + assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." + helper = LayerHelper('lstm', **locals()) + size = size // 4 + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) + bias_size = [1, 7 * size] + if not use_peepholes: + bias_size[1] = 4 * size + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + + hidden = helper.create_variable_for_type_inference(dtype) + cell = helper.create_variable_for_type_inference(dtype) + batch_gate = helper.create_variable_for_type_inference(dtype) + batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) + inputs = {'Input': input, 'Weight': weight, 'Bias': bias} + batch_size = input.shape[0] + if h_0: + assert h_0.shape == (batch_size, size), \ + 'The shape of h0 should be (batch_size, %d)' % size + inputs['H0'] = h_0 + if c_0: + assert c_0.shape == (batch_size, size), \ + 'The shape of c0 should be (batch_size, %d)' % size + inputs['C0'] = c_0 + + helper.append_op( + type='lstm', + inputs=inputs, + outputs={ + 'Hidden': hidden, + 'Cell': cell, + 'BatchGate': batch_gate, + 'BatchCellPreAct': batch_cell_pre_act + }, + attrs={ + 'use_peepholes': use_peepholes, + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'cell_activation': cell_activation, + 'candidate_activation': candidate_activation + }) + return hidden, cell def dynamic_lstmp(input, @@ -969,43 +961,39 @@ def linear_chain_crf(input, label, param_attr=None): return log_likelihood -if os.name != 'nt': - - @templatedoc() - def crf_decoding(input, param_attr, label=None): - """ - ${comment} +@templatedoc() +def crf_decoding(input, param_attr, label=None): + """ + ${comment} - Args: - input(${emission_type}): ${emission_comment} + Args: + input(${emission_type}): ${emission_comment} - param_attr(ParamAttr): The parameter attribute for training. + param_attr(ParamAttr): The parameter attribute for training. - label(${label_type}): ${label_comment} + label(${label_type}): ${label_comment} - Returns: - Variable: ${viterbi_path_comment} + Returns: + Variable: ${viterbi_path_comment} - Examples: - .. code-block:: python + Examples: + .. code-block:: python - crf_decode = layers.crf_decoding( - input=hidden, param_attr=ParamAttr(name="crfw")) - """ - helper = LayerHelper('crf_decoding', **locals()) - transition = helper.get_parameter(param_attr.name) - viterbi_path = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) - helper.append_op( - type='crf_decoding', - inputs={ - "Emission": [input], + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + """ + helper = LayerHelper('crf_decoding', **locals()) + transition = helper.get_parameter(param_attr.name) + viterbi_path = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + helper.append_op( + type='crf_decoding', + inputs={"Emission": [input], "Transition": transition, - "Label": label - }, - outputs={"ViterbiPath": [viterbi_path]}) + "Label": label}, + outputs={"ViterbiPath": [viterbi_path]}) - return viterbi_path + return viterbi_path @templatedoc() @@ -5599,48 +5587,42 @@ def label_smooth(label, return smooth_label -if os.name != 'nt': - - @templatedoc() - def roi_pool(input, - rois, - pooled_height=1, - pooled_width=1, - spatial_scale=1.0): - """ - ${comment} - - Args: - input (Variable): ${x_comment} - rois (Variable): ROIs (Regions of Interest) to pool over. - pooled_height (integer): ${pooled_height_comment} Default: 1 - pooled_width (integer): ${pooled_width_comment} Default: 1 - spatial_scale (float): ${spatial_scale_comment} Default: 1.0 - - Returns: - Variable: ${out_comment}. - - Examples: - .. code-block:: python - - pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) - """ - helper = LayerHelper('roi_pool', **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - argmaxes = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="roi_pool", - inputs={"X": input, - "ROIs": rois}, - outputs={"Out": pool_out, - "Argmax": argmaxes}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale - }) - return pool_out +@templatedoc() +def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): + """ + ${comment} + + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + + Returns: + Variable: ${out_comment}. + + Examples: + .. code-block:: python + + pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) + """ + helper = LayerHelper('roi_pool', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + argmaxes = helper.create_variable_for_type_inference(dtype='int32') + helper.append_op( + type="roi_pool", + inputs={"X": input, + "ROIs": rois}, + outputs={"Out": pool_out, + "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale + }) + return pool_out @templatedoc() diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 66eb1229aa..6c18af7283 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -100,26 +100,27 @@ Examples: >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) """ -if os.name != 'nt': - __all__ += ['cumsum'] - - _cum_sum_ = generate_layer_fn('cumsum') - - def cumsum(x, axis=None, exclusive=None, reverse=None): - locals_var = locals().keys() - kwargs = dict() - for name in locals_var: - val = locals()[name] - if val is not None: - kwargs[name] = val - return _cum_sum_(**kwargs) - - cumsum.__doc__ = _cum_sum_.__doc__ + """ - Examples: - - >>> data = fluid.layers.data(name="input", shape=[32, 784]) - >>> result = fluid.layers.cumsum(data, axis=0) - """ +__all__ += ['cumsum'] + +_cum_sum_ = generate_layer_fn('cumsum') + + +def cumsum(x, axis=None, exclusive=None, reverse=None): + locals_var = locals().keys() + kwargs = dict() + for name in locals_var: + val = locals()[name] + if val is not None: + kwargs[name] = val + return _cum_sum_(**kwargs) + + +cumsum.__doc__ = _cum_sum_.__doc__ + """ +Examples: + + >>> data = fluid.layers.data(name="input", shape=[32, 784]) + >>> result = fluid.layers.cumsum(data, axis=0) +""" __all__ += ['thresholded_relu'] From a1fa18542f308cc6c8a495f36ef72428b03ee704 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sun, 18 Nov 2018 12:56:03 +0800 Subject: [PATCH 015/113] rollback test=develop --- paddle/fluid/operators/math/jit_code.cc | 5 ----- paddle/fluid/platform/cpu_info.h | 5 ----- paddle/fluid/platform/enforce.h | 5 ----- 3 files changed, 15 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 0f4b8f65ac..e3b600d442 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -118,12 +118,7 @@ void VXXJitCode::generate() { ret(); } -#ifdef _WIN32 -#define ALIGN32 -#else #define ALIGN32 __attribute__((aligned(32))) -#endif - #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f #define CEPHES_LOG2EF 1.44269504088896341 diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 1b4840d9a1..6810a1651a 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -14,11 +14,6 @@ limitations under the License. */ #pragma once -#ifdef _WIN32 -#undef __AVX__ -#undef __AVX2__ -#endif - #include namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c03bbd59ac..a251bfcd99 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -14,11 +14,6 @@ limitations under the License. */ #pragma once -#ifdef _WIN32 -#undef __AVX__ -#undef __AVX2__ -#endif - #ifdef __GNUC__ #include // for __cxa_demangle #endif // __GNUC__ From c59d3e83bc98d2dd3a8d9370b368c7d12c97d314 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sun, 18 Nov 2018 18:06:09 +0800 Subject: [PATCH 016/113] test case fix --- paddle/fluid/platform/enforce.h | 64 ++++++++------------------------- 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a251bfcd99..3643d2ad15 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -127,14 +127,14 @@ struct EOFException : public std::exception { #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) #else // there is no equivalent intrinsics in msvc. -#define UNLIKELY(condition) (condition == 0) +#define UNLIKELY(condition) (condition) #endif #if !defined(_WIN32) #define LIKELY(condition) __builtin_expect(static_cast(condition), 1) #else // there is no equivalent intrinsics in msvc. -#define LIKELY(condition) (condition != 0) +#define LIKELY(condition) !(condition) #endif template @@ -248,7 +248,6 @@ inline void throw_on_error(T e) { throw_on_error(e, ""); } -#if !defined(_WIN32) #define PADDLE_THROW(...) \ do { \ throw ::paddle::platform::EnforceNotMet( \ @@ -272,17 +271,6 @@ inline void throw_on_error(T e) { #define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG -#else // !_WIN32 -// disable enforce, caused by the varardic macro exception error -#define PADDLE_THROW(x) \ - do { \ - throw std::make_exception_ptr( \ - std::runtime_error("Windows disable the enforce.")); \ - } while (false) - -#define PADDLE_ENFORCE(x, ...) x -#endif // !_WIN32 - #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ @@ -302,20 +290,6 @@ inline void throw_on_error(T e) { * extra messages is also supported, for example: * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) */ -#if !defined(_WIN32) -#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) -#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) -#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) -#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) -#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) -#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) - #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ do { \ if (UNLIKELY(nullptr == (__VAL))) { \ @@ -335,27 +309,19 @@ inline void throw_on_error(T e) { paddle::string::Sprintf("" __VA_ARGS__)); \ } \ } while (0) -#else -#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1)) -#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1)) -#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1)) -#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1)) -#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1)) -#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1)) - -#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ - do { \ - if (!((__VAL0)__CMP(__VAL1))) { \ - PADDLE_THROW("Windows disable the enforce. Enforce failed."); \ - } \ - } while (0) -#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...) \ - do { \ - if (nullptr == (__VAL1)) { \ - PADDLE_THROW("Windows disable the enforce. Enforce failed"); \ - } \ - } while (0) -#endif // !_WIN32 + +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) } // namespace platform } // namespace paddle From 7d51a0e887c121b292a217e2ef3898b5c619b48a Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 10:07:45 +0800 Subject: [PATCH 017/113] disable DSO by default on windows --- CMakeLists.txt | 2 ++ paddle/fluid/operators/CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c2804e234d..0b42e60e17 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -133,6 +133,8 @@ endif() if (WIN32) set(WITH_AVX OFF CACHE STRING "Disable AVX when compiling for Windows" FORCE) + set(WITH_DSO OFF CACHE STRING + "Disable DSO when compiling for Windows" FORCE) endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 284bf5dc9e..73f44f3b67 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -35,7 +35,7 @@ endif() register_operators(EXCLUDES warpctc_op) # warpctc_cudnn need cudnn 7 above -if (WITH_GPU) +if (WITH_GPU AND NOT WIN32) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() From 1aff40a4c600d88fffc9117fc37d8feb0e7050e4 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 10:32:50 +0800 Subject: [PATCH 018/113] exclude warpctc_op on windows --- paddle/fluid/operators/CMakeLists.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 73f44f3b67..9b1b272292 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -32,7 +32,9 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -register_operators(EXCLUDES warpctc_op) +if (NOT WIN32) + register_operators(EXCLUDES warpctc_op) +endif() # warpctc_cudnn need cudnn 7 above if (WITH_GPU AND NOT WIN32) @@ -47,10 +49,10 @@ endif() set(COMMON_OP_DEPS "") -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} lstm_compute matrix_bit_code sequence2batch gru_compute activation_functions jit_kernel) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) From 8cf63475b096e0ce1f8421d644897fd294ec9c18 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 10:53:16 +0800 Subject: [PATCH 019/113] exclude the dynload_warpctc out on windows test=develop --- paddle/fluid/operators/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9b1b272292..041de46ff5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -49,11 +49,12 @@ endif() set(COMMON_OP_DEPS "") -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) if (NOT WIN32) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) endif() From 449406434ec680dff564847cad0d453590282e99 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 10:58:23 +0800 Subject: [PATCH 020/113] fix the scripts error test=develop --- paddle/fluid/operators/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 041de46ff5..60a42cf568 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -53,7 +53,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows if (NOT WIN32) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) From cc319f64cbd2b2cccb0fe4e9117c1517927ba515 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 11:15:12 +0800 Subject: [PATCH 021/113] disable avx on windows by default test=develop --- cmake/simd.cmake | 54 ++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 4926fb9913..86096d4fea 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,21 +57,21 @@ int main() return 0; }" SSE3_FOUND) -# Check AVX -set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) -set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; -}" AVX_FOUND) - -# disable AVX2 by default on windows +# disable AVX by default on windows if(NOT WIN32) + # Check AVX + set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) + set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; + }" AVX_FOUND) + # Check AVX 2 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) @@ -83,20 +83,20 @@ if(NOT WIN32) __m256i result = _mm256_abs_epi32 (a); return 0; }" AVX2_FOUND) -endif(NOT WIN32) -# Check AVX512F -set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) -set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; -}" AVX512F_FOUND) + # Check AVX512F + set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) + set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; + }" AVX512F_FOUND) +endif(NOT WIN32) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) From 4a6769da84ac9f8a5dafbc0b9a00ef70944a2395 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 11:39:56 +0800 Subject: [PATCH 022/113] re-organize the cmake file --- cmake/simd.cmake | 54 +++++++++++++-------------- paddle/fluid/operators/CMakeLists.txt | 5 ++- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 4926fb9913..86096d4fea 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,21 +57,21 @@ int main() return 0; }" SSE3_FOUND) -# Check AVX -set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) -set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; -}" AVX_FOUND) - -# disable AVX2 by default on windows +# disable AVX by default on windows if(NOT WIN32) + # Check AVX + set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) + set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; + }" AVX_FOUND) + # Check AVX 2 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) @@ -83,20 +83,20 @@ if(NOT WIN32) __m256i result = _mm256_abs_epi32 (a); return 0; }" AVX2_FOUND) -endif(NOT WIN32) -# Check AVX512F -set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) -set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; -}" AVX512F_FOUND) + # Check AVX512F + set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) + set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; + }" AVX512F_FOUND) +endif(NOT WIN32) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9b1b272292..60a42cf568 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -49,11 +49,12 @@ endif() set(COMMON_OP_DEPS "") -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) if (NOT WIN32) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) endif() From 6e23d6a2d7e918d18e20380f9ac2192a7aaa91c8 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 13:46:21 +0800 Subject: [PATCH 023/113] disable mkl on windows by default --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b42e60e17..d9b797f3d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,6 +135,8 @@ if (WIN32) "Disable AVX when compiling for Windows" FORCE) set(WITH_DSO OFF CACHE STRING "Disable DSO when compiling for Windows" FORCE) + set(WITH_MKL OFF CACHE STRING + "Disable MKL when compiling for Windows" FORCE) endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING From 8443961a4f8b09ca1cfe632633ef21df87f5788a Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 16:55:49 +0800 Subject: [PATCH 024/113] add warp_ctc back --- paddle/fluid/operators/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 60a42cf568..412ab66709 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -32,9 +32,7 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -if (NOT WIN32) - register_operators(EXCLUDES warpctc_op) -endif() +register_operators(EXCLUDES warpctc_op) # warpctc_cudnn need cudnn 7 above if (WITH_GPU AND NOT WIN32) From a43bc612ad2590eeab42196b0adf87288f1c41f4 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 17:08:06 +0800 Subject: [PATCH 025/113] fix the dependency --- paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 6611e2e4b3..b6811f9183 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1 +1 @@ -nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce) +nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context) From 81f750a88c354a7f34ee6732a152a87b0ee25d2f Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 17:10:38 +0800 Subject: [PATCH 026/113] fix the dependency --- paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 6611e2e4b3..b6811f9183 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1 +1 @@ -nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce) +nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context) From 16bc8f2a755438cac5a279f27b082dbaf0e3e3a0 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 19 Nov 2018 21:52:59 +0800 Subject: [PATCH 027/113] Add debug info --- .dockerignore | 1 + Dockerfile | 86 +++++++++++++++++++++++++-------------------------- 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/.dockerignore b/.dockerignore index 2b2e74053d..49adfe4f0a 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,6 @@ *.DS_Store build/ +build* *.user .vscode .idea diff --git a/Dockerfile b/Dockerfile index c8b9eed6d6..b36102175c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,46 +71,46 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip3 install -U wheel && \ - pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - easy_install -U pip && \ - pip install -U pip setuptools wheel && \ - pip install -U docopt PyYAML sphinx==1.5.6 && \ - pip install sphinx-rtd-theme==0.1.9 recommonmark - -RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 install opencv-python && \ - pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install opencv-python - -#For docstring checker -RUN pip3 install pylint pytest astroid isort -RUN pip install pylint pytest astroid isort LinkChecker - -COPY ./python/requirements.txt /root/ -RUN pip3 install -r /root/requirements.txt -RUN pip install -r /root/requirements.txt - -# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use -# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -RUN apt-get install -y libssl-dev libffi-dev -RUN pip3 install certifi urllib3[secure] -RUN pip install certifi urllib3[secure] - - -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . \ - make) - -# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service -RUN mkdir /var/run/sshd -RUN echo 'root:root' | chpasswd -RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config -RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config -EXPOSE 22 +# RUN pip3 install -U wheel && \ + # pip3 install -U docopt PyYAML sphinx==1.5.6 && \ + # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ + # easy_install -U pip && \ + # pip install -U pip setuptools wheel && \ + # pip install -U docopt PyYAML sphinx==1.5.6 && \ + # pip install sphinx-rtd-theme==0.1.9 recommonmark + +# RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + # pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + # pip3 install opencv-python && \ + # pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + # pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + # pip install opencv-python + +# #For docstring checker +# RUN pip3 install pylint pytest astroid isort +# RUN pip install pylint pytest astroid isort LinkChecker + +# COPY ./python/requirements.txt /root/ +# RUN pip3 install -r /root/requirements.txt +# RUN pip install -r /root/requirements.txt + +# # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use +# # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 +# RUN apt-get install -y libssl-dev libffi-dev +# RUN pip3 install certifi urllib3[secure] +# RUN pip install certifi urllib3[secure] + + +# # Install woboq_codebrowser to /woboq +# RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ + # (cd /woboq \ + # cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ + # -DCMAKE_BUILD_TYPE=Release . \ + # make) + +# # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service +# RUN mkdir /var/run/sshd +# RUN echo 'root:root' | chpasswd +# RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config +# RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +# EXPOSE 22 From 3f73c0a70d641b2b84b4764ee55f3f942fb2c6da Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 20 Nov 2018 10:26:59 +0800 Subject: [PATCH 028/113] fix the build issue on windows --- paddle/fluid/memory/allocation/cpu_allocator.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 9e0044c47a..165f11cd3b 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -15,6 +15,11 @@ #pragma once #include "paddle/fluid/memory/allocation/allocator.h" +#ifdef _WIN32 +#define posix_memalign_free _aligned_free +#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) +#endif + namespace paddle { namespace memory { namespace allocation { From 301ed153231f0e0f6066663c1adc572af4907c97 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 20 Nov 2018 13:36:10 +0800 Subject: [PATCH 029/113] remove unsupported flag on windows --- python/paddle/fluid/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index a7dfc6e9e3..091697aaa5 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,7 +116,7 @@ def __bootstrap__(): 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb', + "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', 'reader_queue_speed_test_mode' ] if os.name != 'nt': From 935387f3fc4c36a13443443cb820868b65a3c667 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 20 Nov 2018 14:40:19 +0800 Subject: [PATCH 030/113] code style --- python/paddle/fluid/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2a28f7b2d1..6a4a5e098f 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,9 +116,8 @@ def __bootstrap__(): 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - "dist_threadpool_size", 'eager_delete_tensor_gb', - 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir' + "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', + 'reader_queue_speed_test_mode', 'print_sub_graph_dir' ] if os.name != 'nt': read_env_flags.append('warpctc_dir') From afeadf58f9ce90d4dd05f1eb1e4936cb83bc0cde Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 20 Nov 2018 14:59:20 +0800 Subject: [PATCH 031/113] code style test=develop --- paddle/fluid/memory/allocation/cpu_allocator.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 165f11cd3b..26d3643f4e 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -17,7 +17,8 @@ #ifdef _WIN32 #define posix_memalign_free _aligned_free -#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) #endif namespace paddle { From 0e1b426c839d8c91952b7d18139d3681beaa726f Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Tue, 20 Nov 2018 07:02:12 +0000 Subject: [PATCH 032/113] refine prelu api doc, test=develop --- python/paddle/fluid/layers/nn.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 99acd7e308..5749bcc54a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6900,18 +6900,18 @@ def prelu(x, mode, param_attr=None, name=None): """ Equation: - y = \max(0, x) + alpha \min(0, x) + y = \max(0, x) + alpha * \min(0, x) Args: x (Variable): The input tensor. param_attr(ParamAttr|None): The parameter attribute for the learnable - weight (alpha). - mode (string): The mode for weight sharing - all: all elements share same weight - channel:elements in a channel share same weight - element:each element has a weight + weight (alpha). + mode (string): The mode for weight sharing. It supports all, channel + and element. all: all elements share same weight + channel:elements in a channel share same weight + element:each element has a weight name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + will be named automatically. Returns: Variable: The output tensor with the same shape as input. @@ -6921,8 +6921,8 @@ def prelu(x, mode, param_attr=None, name=None): .. code-block:: python x = fluid.layers.data(name="x", shape=[10,10], dtype="float32") - mode = 'channel' - output = fluid.layers.prelu(x,mode) + mode = 'channel' + output = fluid.layers.prelu(x,mode) """ helper = LayerHelper('prelu', **locals()) if mode not in ['all', 'channel', 'element']: From c2cfb03a7277a92297b4617cb5c778bb495a998b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 20 Nov 2018 08:50:24 +0000 Subject: [PATCH 033/113] add lstm jitcode --- paddle/fluid/operators/math/jit_code.cc | 49 +++++++++ paddle/fluid/operators/math/jit_code.h | 102 ++++++++++++++++-- paddle/fluid/operators/math/jit_kernel.h | 15 ++- paddle/fluid/operators/math/jit_kernel_impl.h | 49 +++++++++ 4 files changed, 198 insertions(+), 17 deletions(-) create mode 100644 paddle/fluid/operators/math/jit_kernel_impl.h diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index e484e9a3c7..418c843362 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_code.h" +#include // offsetof #include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me namespace paddle { @@ -210,6 +211,54 @@ void VActJitCode::generate() { ret(); } +bool LSTMJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } + +void LSTMJitCode::generate() { + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ct_1 = r9; + reg64_t reg_ptr_ct = r10; + reg64_t reg_ptr_ht = r11; + mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); + mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); + mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + + int offset = 0; + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + /* C_t = C_t-1 * fgated + cand_gated * igated*/ + // c + vmovups(ymm_src, ptr[reg_ptr_gates + offset]); + act(ymm_c, ymm_src, act_cand_); + // i + vmovups(ymm_src, ptr[reg_ptr_gates + offset + num_]); + act(ymm_i, ymm_src, act_gate_); + vmulps(ymm_c, ymm_c, ymm_i); + if (first_) { + // f + vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * num_]); + act(ymm_f, ymm_src, act_gate_); + vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]); + vmulps(ymm_f, ymm_f, ymm_i); + vaddps(ymm_f, ymm_f, ymm_c); + } + /* H_t = act_cell(C_t) * ogated */ + ymm_t ymm_ct = first_ ? ymm_c : ymm_f; + ymm_t ymm_o = first_ ? ymm_f : ymm_c; + ymm_t ymm_tmp = ymm_i; + act(ymm_tmp, ymm_ct, act_cell_); + vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * num_]); + act(ymm_o, ymm_src, act_gate_); + vmulps(ymm_o, ymm_tmp, ymm_o); + // save ct and ht + vmovups(ptr[reg_ptr_ct + offset], ymm_ct); + vmovups(ptr[reg_ptr_ht + offset], ymm_o); + + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + + ret(); +} + } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 65f83ff484..938b5525c1 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/jit_gen.h" +#include "paddle/fluid/operators/math/jit_kernel_impl.h" #include "paddle/fluid/platform/cpu_info.h" namespace paddle { @@ -46,14 +47,6 @@ extern const float exp_float_consts[]; extern const int exp_int_0x7f[]; extern int g_tmp_mem[]; -// TODO(TJ): move these to some proper place -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 -#define XMM_FLOAT_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define ZMM_FLOAT_BLOCK 16 - #define ALIGN32 __attribute__((aligned(32))) #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f @@ -322,6 +315,99 @@ class VActJitCode : public JitCode { ymm_t ymm_dst = ymm_t(1); }; +class LSTMJitCode : public VActJitCode { + public: + const char* name() const override { + std::string base = "LSTMJitCode"; + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + if (first_) { + base += "_C1H1"; + } + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + AddTypeStr(act_cell_); + return base.c_str(); + } + + explicit LSTMJitCode(int d, bool first, operand_type act_gate, + operand_type act_cand, operand_type act_cell, + size_t code_size = 256 * 1024, void* code_ptr = nullptr) + : VActJitCode(d, act_gate, code_size, code_ptr), + num_(d), + first_(first), + act_gate_(act_gate), + act_cand_(act_cand), + act_cell_(act_cell) {} + static bool init(int d); + void generate() override; + + protected: + int num_; + bool first_; + operand_type act_gate_; + operand_type act_cand_; + operand_type act_cell_; + reg64_t param1{abi_param1}; + + xmm_t xmm_src = xmm_t(0); + xmm_t xmm_c = xmm_t(1); + xmm_t xmm_i = xmm_t(2); + xmm_t xmm_f = xmm_t(3); + + ymm_t ymm_src = ymm_t(0); + ymm_t ymm_c = ymm_t(1); + ymm_t ymm_i = ymm_t(2); + ymm_t ymm_f = ymm_t(3); + + template + void act(JMM& dst, JMM& src, operand_type type) { // NOLINT + // use 15 + JMM zero = JMM(15); + if (type_ == operand_type::relu) { + vxorps(zero, zero, zero); + } + switch (type) { + case operand_type::relu: + relu_jmm(dst, src, zero); + break; + case operand_type::exp: + exp_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::sigmoid: + sigmoid_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::tanh: + tanh_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::identity: + break; + default: + // throw error + break; + } + } +}; + } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 7e163c1349..b5e54fcc1b 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -17,6 +17,7 @@ limitations under the License. */ #include // for shared_ptr #include #include +#include "paddle/fluid/operators/math/jit_kernel_impl.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/macros.h" @@ -26,14 +27,7 @@ namespace operators { namespace math { namespace jitkernel { -// TODO(TJ): move these to some proper place -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 -#define XMM_FLOAT_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define ZMM_FLOAT_BLOCK 16 - +// TODO(TJ): remove me typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block; class Kernel { @@ -124,10 +118,13 @@ class LSTMKernel : public Kernel { const T *wp_data = nullptr, T *checked = nullptr) const = 0; - // compute c1 and h1 without c0 or h0 virtual void ComputeC1H1(T *gates, T *ct, T *ht, /* below only used in peephole*/ const T *wp_data = nullptr) const = 0; + + // void (*ComputeCtHt)(lstm_t *); + // // compute c1 and h1 without c0 or h0 + // void (*ComputeC1H1)(lstm_t *); }; template diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h new file mode 100644 index 0000000000..fcb6a7c097 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 +#define XMM_FLOAT_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define ZMM_FLOAT_BLOCK 16 + +typedef struct { + void* gates; // gates: W_ch, W_ih, W_fh, W_oh + const void* ct_1; + void* ct; + void* ht; + /* below only used in peephole*/ + const void* wp_data{nullptr}; + void* checked{nullptr}; +} lstm_t; + +typedef struct { + int d; + std::string act_gate, act_cand, act_cell; +} lstm_attr_t; + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle From 5d6b370a4968fc4bc7dea369ee588ebec0b8f660 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 20 Nov 2018 20:17:16 +0800 Subject: [PATCH 034/113] fix issue --- paddle/fluid/platform/enforce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 3643d2ad15..31309738a5 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -134,7 +134,7 @@ struct EOFException : public std::exception { #define LIKELY(condition) __builtin_expect(static_cast(condition), 1) #else // there is no equivalent intrinsics in msvc. -#define LIKELY(condition) !(condition) +#define LIKELY(condition) (condition) #endif template From ce31deb7e938270249b719bce93ef6d8baf5c0c4 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 20 Nov 2018 12:37:43 +0000 Subject: [PATCH 035/113] refine refer code and add lstm refer code test=develop --- .../fluid/operators/math/jit_kernel_blas.cc | 65 +------ paddle/fluid/operators/math/jit_kernel_exp.cc | 40 +--- paddle/fluid/operators/math/jit_kernel_impl.h | 6 +- .../fluid/operators/math/jit_kernel_refer.h | 171 ++++++++++++++++++ .../fluid/operators/math/jit_kernel_test.cc | 139 +++----------- 5 files changed, 220 insertions(+), 201 deletions(-) create mode 100644 paddle/fluid/operators/math/jit_kernel_refer.h diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index 36a50f2043..90b7029371 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_XBYAK @@ -31,49 +32,6 @@ namespace math { namespace jitkernel { namespace jit = platform::jit; -template -void VMulRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - -template -void VAddRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - -template -void VAddReluRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? z[i] : 0; - } -} - -template -void VScalRefer(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] * x[i]; - } -} - -template -void VAddBiasRefer(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] + x[i]; - } -} - -template -void VReluRefer(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; - } -} - #ifdef PADDLE_WITH_MKLML template void VMulMKL(const T* x, const T* y, T* z, int n); @@ -109,7 +67,7 @@ void VScalMKL(const float* a, const float* x, float* y, int n) { if (x == y) { platform::dynload::cblas_sscal(n, *a, y, 1); } else { - VScalRefer(a, x, y, n); + refer::VScal(a, x, y, n); } } @@ -118,7 +76,7 @@ void VScalMKL(const double* a, const double* x, double* y, int n) { if (x == y) { platform::dynload::cblas_dscal(n, *a, y, 1); } else { - VScalRefer(a, x, y, n); + refer::VScal(a, x, y, n); } } @@ -147,7 +105,7 @@ class VMulKernelImpl : public VMulKernel { return; } #endif - this->Compute = VMulRefer; + this->Compute = refer::VMul; } #ifdef PADDLE_WITH_XBYAK @@ -198,7 +156,7 @@ class VAddKernelImpl : public VAddKernel { return; } #endif - this->Compute = VAddRefer; + this->Compute = refer::VAdd; } #ifdef PADDLE_WITH_XBYAK @@ -242,7 +200,7 @@ class VAddReluKernelImpl : public VAddReluKernel { return; } #endif - this->Compute = VAddReluRefer; + this->Compute = refer::VAddRelu; } #ifdef PADDLE_WITH_XBYAK @@ -280,7 +238,7 @@ class VScalKernelImpl : public VScalKernel { return; } #endif - this->Compute = VScalRefer; + this->Compute = refer::VScal; } #ifdef PADDLE_WITH_XBYAK @@ -324,7 +282,7 @@ class VAddBiasKernelImpl : public VAddBiasKernel { } #endif - this->Compute = VAddBiasRefer; + this->Compute = refer::VAddBias; } #ifdef PADDLE_WITH_XBYAK @@ -358,7 +316,7 @@ class VReluKernelImpl : public VReluKernel { } #endif - this->Compute = VReluRefer; + this->Compute = refer::VRelu; } #ifdef PADDLE_WITH_XBYAK @@ -374,16 +332,13 @@ bool VReluKernelImpl::useJIT(int d) { } #endif -template -inline void VIdentityRefer(const T* x, T* y, int n) {} - /* An empty JitKernel */ template class VIdentityKernelImpl : public VIdentityKernel { public: JITKERNEL_DECLARE_STATIC_FUNC; explicit VIdentityKernelImpl(int d) : VIdentityKernel() { - this->Compute = VIdentityRefer; + this->Compute = refer::VIdentity; } }; diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index f26815300d..1fe7d66c75 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" -#include // for exp #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #ifdef PADDLE_WITH_XBYAK #include "paddle/fluid/operators/math/jit_code.h" @@ -35,38 +35,6 @@ namespace math { namespace jitkernel { namespace jit = platform::jit; -// TODO(TJ): move refer codes to one file -// Refer code only focus on correctness -template -void VExpRefer(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - -template -void VSigmoidRefer(const T* x, T* y, int n) { - // y = 1 / (1 + e^-x) - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); - y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); - } -} - -template -void VTanhRefer(const T* x, T* y, int n) { - // y = 2 * sigmoid(2x) - 1 - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoidRefer(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - #ifdef PADDLE_WITH_MKLML // try to use MKL to speedup template @@ -129,7 +97,7 @@ class VExpKernelImpl : public VExpKernel { return; } #endif - this->Compute = VExpRefer; + this->Compute = refer::VExp; } #ifdef PADDLE_WITH_XBYAK @@ -182,7 +150,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel { return; } #endif - this->Compute = VSigmoidRefer; + this->Compute = refer::VSigmoid; } #ifdef PADDLE_WITH_XBYAK @@ -234,7 +202,7 @@ class VTanhKernelImpl : public VTanhKernel { return; } #endif - this->Compute = VTanhRefer; + this->Compute = refer::VTanh; } #ifdef PADDLE_WITH_XBYAK diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h index fcb6a7c097..337d5ae914 100644 --- a/paddle/fluid/operators/math/jit_kernel_impl.h +++ b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -38,9 +38,13 @@ typedef struct { void* checked{nullptr}; } lstm_t; -typedef struct { +typedef struct lstm_attr_s { int d; std::string act_gate, act_cand, act_cell; + lstm_attr_s() = default; + lstm_attr_s(int _d, const std::string& _act_gate, + const std::string& _act_cand, const std::string& _act_cell) + : d(_d), act_gate(_act_gate), act_cand(_act_cand), act_cell(_act_cell) {} } lstm_attr_t; } // namespace jitkernel diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h new file mode 100644 index 0000000000..9c60ebc587 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -0,0 +1,171 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/operators/math/jit_kernel_impl.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { +namespace refer { +/* Refer code only focus on correctness */ + +template +void VMul(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +template +void VAdd(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +template +void VAddRelu(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + z[i] = z[i] > 0 ? z[i] : 0; + } +} + +template +void VScal(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] * x[i]; + } +} + +template +void VAddBias(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] + x[i]; + } +} + +template +void VRelu(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template +inline void VIdentity(const T* x, T* y, int n) {} + +template +void VExp(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template +void VSigmoid(const T* x, T* y, int n) { + // y = 1 / (1 + e^-x) + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); + } +} + +template +void VTanh(const T* x, T* y, int n) { + // y = 2 * sigmoid(2x) - 1 + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoid(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + +template +void (*getActFunc(const std::string& type))(const T*, T*, int) { // NOLINT + if (type == "sigmoid") { + return VSigmoid; + } else if (type == "relu") { + return VRelu; + } else if (type == "tanh") { + return VTanh; + } else if (type == "identity" || type == "") { + return VIdentity; + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + +template +void LSTMCtHt(lstm_t* step, lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + const T* ct_1 = reinterpret_cast(step->ct_1); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + // gates: W_ch, W_ih, W_fh, W_oh + act_gate(gates + d, gates + d, d3); + + /* C_t = C_t-1 * fgated + cand_gated * igated */ + act_cand(gates, gates, d); + VMul(gates, gates + d, gates + d, d); + VMul(ct_1, gates + d2, gates + d2, d); + VAdd(gates + d, gates + d2, ct, d); + + /* H_t = act_cell(C_t) * ogated */ + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + +template +void LSTMC1H1(lstm_t* step, lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + const T* ct_1 = reinterpret_cast(step->ct_1); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + /* C_t = igated * cgated*/ + act_gate(gates + d, gates + d, d); + act_cand(gates, gates, d); + VMul(gates, gates + d, ct, d); + /* H_t = act_cell(C_t) * ogated */ + act_gate(gates + d3, gates + d3, d); + act_cell(ct, gates + d2, d); + Vmul(gates + d2, gates + d3, ht, d); +} + +} // namespace refer +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index b6c62a2634..a1705a81c4 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" @@ -53,12 +54,6 @@ void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), } } -void vrelu_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0.f ? x[i] : 0.f; - } -} - #if defined __AVX__ || defined __AVX2__ void vrelu_intri8(const int n, const float* x, float* y) { __m256 tmp = _mm256_loadu_ps(x); @@ -69,6 +64,7 @@ void vrelu_intri8(const int n, const float* x, float* y) { TEST(JitKernel, vrelu) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -80,7 +76,7 @@ TEST(JitKernel, vrelu) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vrelu_ref(d, x_data, zref_data); + refer::VRelu(x_data, zref_data, d); } auto trefe = GetCurrentUS(); #if defined __AVX__ || defined __AVX2__ @@ -107,14 +103,9 @@ TEST(JitKernel, vrelu) { } } -void vaddbias_ref(const int n, const float a, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] + a; - } -} - TEST(JitKernel, vaddbias) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -127,7 +118,7 @@ TEST(JitKernel, vaddbias) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vaddbias_ref(d, a, x_data, zref_data); + refer::VAddBias(&a, x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -145,12 +136,6 @@ TEST(JitKernel, vaddbias) { } } -void vexp_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - #ifdef PADDLE_WITH_MKLML void vexp_mkl(const int n, const float* x, float* y) { paddle::platform::dynload::vsExp(n, x, y); @@ -159,6 +144,7 @@ void vexp_mkl(const int n, const float* x, float* y) { TEST(JitKernel, vexp) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -170,7 +156,7 @@ TEST(JitKernel, vexp) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vexp_ref(d, x_data, zref_data); + refer::VExp(x_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -203,19 +189,6 @@ TEST(JitKernel, vexp) { } } -inline float _sigmoid(float x) { - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - float tmp = (x < min) ? min : ((x > max) ? max : x); - return 1.f / (1.f + std::exp(-tmp)); -} - -void vsigmoid_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = _sigmoid(x[i]); - } -} - void vsigmoid_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VExpKernel>& vexp, @@ -234,6 +207,7 @@ void vsigmoid_better( TEST(JitKernel, vsigmoid) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -252,7 +226,7 @@ TEST(JitKernel, vsigmoid) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vsigmoid_ref(d, x_data, zref_data); + refer::VSigmoid(x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -271,14 +245,6 @@ TEST(JitKernel, vsigmoid) { } } -inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; } - -void vtanh_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = _tanh(x[i]); - } -} - void vtanh_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VScalKernel>& vscal, @@ -298,6 +264,7 @@ void vtanh_better( TEST(JitKernel, vtanh) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -320,7 +287,7 @@ TEST(JitKernel, vtanh) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vtanh_ref(d, x_data, zref_data); + refer::VTanh(x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -339,32 +306,6 @@ TEST(JitKernel, vtanh) { } } -void lstm_ctht_ref( - const std::shared_ptr< - const paddle::operators::math::jitkernel::VSigmoidKernel>& - vsigmoid_3d, - const std::shared_ptr< - const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, - const std::shared_ptr< - const paddle::operators::math::jitkernel::VExpKernel>& vexp_1, - const int d, float* gates, const float* ct_1, float* ct, float* ht) { - vsigmoid_3d->Compute(gates + d, gates + d, 3 * d); - vtanh_d->Compute(gates, gates, d); - const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - for (int k = 0; k < d; ++k) { - // C_t = C_t-1 * fgated + cand_gated * igated - ct[k] = ct_1[k] * f[k] + gates[k] * i[k]; - // H_t = act_cell(C_t) * ogated - float tmp = ct[k] * 2; - tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); - vexp_1->Compute(&tmp, &tmp, 1); - tmp = 2.f / (1.f + tmp) - 1.f; - ht[k] = tmp * o[k]; - } -} - void lstm_ctht_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VSigmoidKernel>& @@ -389,6 +330,7 @@ void lstm_ctht_better( TEST(JitKernel, lstm) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) { int d4 = d * 4; int d3 = d * 3; @@ -410,8 +352,6 @@ TEST(JitKernel, lstm) { d3); const auto& vtanh_d = jit::KernelPool::Instance().template Get>(d); - const auto& vexp_1 = - jit::KernelPool::Instance().template Get>(1); const auto& vmul_d = jit::KernelPool::Instance().template Get>(d); const auto& vadd_d = @@ -425,8 +365,14 @@ TEST(JitKernel, lstm) { float* ct_ref_data = ct_ref.data(); float* ht_ref_data = ht_ref.data(); // compute once to check correctness - lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, - ct_ref_data, ht_ref_data); + jit::lstm_t step; + jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell); + step.gates = xref_data; + step.ct_1 = ct_1_data; + step.ct = ct_ref_data; + step.ht = ht_ref_data; + refer::LSTMCtHt(&step, &attr); + ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); for (int i = 0; i < d; ++i) { EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3); @@ -441,8 +387,7 @@ TEST(JitKernel, lstm) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, - ct_ref_data, ht_ref_data); + refer::LSTMCtHt(&step, &attr); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -457,16 +402,6 @@ TEST(JitKernel, lstm) { } } -void vscal_ref(const int n, const float a, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = a * x[i]; - } -} -void vscal_inp_ref(const int n, const float a, float* x) { - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -} #if defined __AVX__ || defined __AVX2__ void vscal_intri8(const int n, const float a, const float* x, float* y) { __m256 tmp; @@ -492,6 +427,7 @@ void vscal_inp_mkl(const int n, const float a, float* x) { TEST(JitKernel, vscal) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -506,12 +442,12 @@ TEST(JitKernel, vscal) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vscal_ref(d, a, x_data, zref_data); + refer::VScal(&a, x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto trefs1 = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vscal_inp_ref(d, a, y_data); + refer::VScal(&a, y_data, y_data, d); } auto trefe1 = GetCurrentUS(); @@ -567,12 +503,6 @@ TEST(JitKernel, vscal) { } } -void vmul_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - #if defined __AVX__ || defined __AVX2__ void vmul_intri8(const int n, const float* x, const float* y, float* z) { __m256 tmpx, tmpy; @@ -591,6 +521,7 @@ void vmul_mkl(const int n, const float* x, const float* y, float* z) { TEST(JitKernel, vmul) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 20, 30, 256, 512, 1000, 1024}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -604,7 +535,7 @@ TEST(JitKernel, vmul) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vmul_ref(d, x_data, y_data, zref_data); + refer::VMul(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -647,12 +578,6 @@ TEST(JitKernel, vmul) { } } -void vadd_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - #if defined __AVX__ || defined __AVX2__ void vadd_intri8(const int n, const float* x, const float* y, float* z) { __m256 tmpx, tmpy; @@ -671,6 +596,7 @@ void vadd_mkl(const int n, const float* x, const float* y, float* z) { TEST(JitKernel, vadd) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -684,7 +610,7 @@ TEST(JitKernel, vadd) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vadd_ref(d, x_data, y_data, zref_data); + refer::VAdd(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -727,12 +653,6 @@ TEST(JitKernel, vadd) { } } -void vaddrelu_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? z[i] : 0; - } -} void vaddrelu_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VAddKernel>& vadd, @@ -745,6 +665,7 @@ void vaddrelu_better( TEST(JitKernel, vaddrelu) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -762,7 +683,7 @@ TEST(JitKernel, vaddrelu) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vaddrelu_ref(d, x_data, y_data, zref_data); + refer::VAddRelu(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); auto tmkls = GetCurrentUS(); From 703b26e697c0a15a903d2e346d191e033e181073 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 21 Nov 2018 11:22:34 +0800 Subject: [PATCH 036/113] add profiler, parallel_executor back --- paddle/fluid/framework/CMakeLists.txt | 9 - .../fast_threaded_ssa_graph_executor.h | 2 +- .../fluid/memory/allocation/cpu_allocator.h | 3 +- paddle/fluid/platform/CMakeLists.txt | 12 +- paddle/fluid/platform/device_tracer.h | 12 +- paddle/fluid/platform/enforce.h | 2 +- paddle/fluid/platform/port.h | 21 + paddle/fluid/platform/profiler.cc | 6 +- paddle/fluid/platform/profiler.h | 10 - .../fluid/platform/stream_callback_manager.h | 13 +- paddle/fluid/pybind/CMakeLists.txt | 5 +- paddle/fluid/pybind/pybind.cc | 6 - python/paddle/fluid/__init__.py | 3 +- python/paddle/fluid/parallel_executor.py | 497 +++++++++--------- 14 files changed, 293 insertions(+), 308 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 42af482f85..43e1bc6b2e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -31,9 +31,7 @@ function(windows_symbolic TARGET) endfunction() add_subdirectory(ir) -if (NOT WIN32) add_subdirectory(details) -endif (NOT WIN32) # ddim lib proto_library(framework_proto SRCS framework.proto) @@ -118,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) -if (NOT WIN32) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler) -else() -cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor) -endif(NOT WIN32) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -179,12 +172,10 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() -if (NOT WIN32) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor) -endif() # NOT WIN32 cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 949616f02d..c3a8b85423 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -13,9 +13,9 @@ // limitations under the License. #pragma once +#include #include #include -#include "ThreadPool.h" #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 165f11cd3b..26d3643f4e 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -17,7 +17,8 @@ #ifdef _WIN32 #define posix_memalign_free _aligned_free -#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) #endif namespace paddle { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 0d0613e1a4..93cb5eb2dc 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,4 +1,3 @@ -if (NOT WIN32) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) py_proto_compile(profiler_py_proto SRCS profiler.proto) @@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _ add_dependencies(profiler_py_proto profiler_py_proto_init) +if (NOT WIN32) add_custom_command(TARGET profiler_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +else(NOT WIN32) +string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/") +add_custom_command(TARGET profiler_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) if(WITH_GPU) @@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) - -if (NOT WIN32) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) -endif(NOT WIN32) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index f59fc40b71..eaf047d474 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -13,17 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#if !defined(_WIN32) -#include -#else -#include -#endif // !_WIN32 - -#include #include // NOLINT #include #include "paddle/fluid/platform/dynload/cupti.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.pb.h" namespace paddle { @@ -32,15 +26,11 @@ namespace platform { /////////////////////// // WARN: Under Development. Don't depend on it yet. ////////////////////// -#if !defined(_WIN32) inline uint64_t PosixInNsec() { struct timeval tv; gettimeofday(&tv, nullptr); return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); } -#else -inline uint64_t PosixInNsec() { return static_cast(0); } -#endif // !_WIN32 // DeviceTracer performs the following tasks: // 1. Register cuda callbacks for various events: kernel, memcpy, etc. diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 3643d2ad15..31309738a5 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -134,7 +134,7 @@ struct EOFException : public std::exception { #define LIKELY(condition) __builtin_expect(static_cast(condition), 1) #else // there is no equivalent intrinsics in msvc. -#define LIKELY(condition) !(condition) +#define LIKELY(condition) (condition) #endif template diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index a07b993c8a..8be77fe464 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -27,6 +28,7 @@ #include // dladdr #include // backtrace #include +#include #include // std::accumulate #else #include // _popen, _pclose @@ -57,6 +59,25 @@ static void *dlopen(const char *filename, int flag) { return reinterpret_cast(hModule); } +static int gettimeofday(struct timeval *tp, void *tzp) { + time_t clock; + struct tm tm; + SYSTEMTIME wtm; + + GetLocalTime(&wtm); + tm.tm_year = wtm.wYear - 1900; + tm.tm_mon = wtm.wMonth - 1; + tm.tm_mday = wtm.wDay; + tm.tm_hour = wtm.wHour; + tm.tm_min = wtm.wMinute; + tm.tm_sec = wtm.wSecond; + tm.tm_isdst = -1; + clock = mktime(&tm); + tp->tv_sec = clock; + tp->tv_usec = wtm.wMilliseconds * 1000; + + return (0); +} #endif // !_WIN32 static void ExecShellCommand(const std::string &cmd, std::string *message) { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 56bf9e31a3..03c102e24a 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/port.h" -#include #include #include #include @@ -438,10 +438,10 @@ void ParseEvents(const std::vector>& events, event_items[index].total_time += event_time; // min time event_items[index].min_time = - std::min(event_time, event_items[index].min_time); + (std::min)(event_time, event_items[index].min_time); // max time event_items[index].max_time = - std::max(event_time, event_items[index].max_time); + (std::max)(event_time, event_items[index].max_time); } // remove the push marker from the list diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index e8eae874af..f5d3490634 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx); -#if !defined(_WIN32) struct RecordEvent { // dev_ctx can be set to nullptr if device is cpu. RecordEvent(const std::string& name, const DeviceContext* dev_ctx); @@ -106,15 +105,6 @@ struct RecordBlock { std::string name_; uint64_t start_ns_; }; -#else -// windows do not support profiler temporarily. -struct RecordEvent { - RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {} -}; -struct RecordBlock { - explicit RecordBlock(int block_id) {} -}; -#endif // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 0e88a439cf..11c68f3449 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -45,16 +45,15 @@ class StreamCallbackManager { inline void AddCallback(Callback &&callback) const { auto *stream_callback_context = new StreamCallbackContext(this, std::forward(callback)); - PADDLE_ENFORCE( #if CUDA_VERSION >= 10000 - cudaLaunchHostFunc(stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context) + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, + StreamCallbackManager::StreamCallbackFunc, + stream_callback_context)); // NOLINT #else - cudaStreamAddCallback(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0) + PADDLE_ENFORCE(cudaStreamAddCallback( + stream_, StreamCallbackManager::StreamCallbackFunc, + stream_callback_context, 0)); // NOLINT #endif - ); // NOLINT } void Wait() const { thread_pool_.reset(new ThreadPool(1)); } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 25e919105c..fb6ee2f4a5 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,9 +1,6 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc) -if(NOT WIN32) - list(APPEND PYBIND_DEPS parallel_executor profiler) -endif(NOT WIN32) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2f040e1c34..102fa02adf 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -36,9 +36,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" -#ifndef _WIN32 #include "paddle/fluid/framework/parallel_executor.h" -#endif #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" @@ -637,7 +635,6 @@ All parameter, weight, gradient are variables in Paddle. #endif #endif -#ifndef _WIN32 py::enum_(m, "ProfilerState", py::arithmetic()) .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) @@ -658,7 +655,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); -#endif py::class_> pass(m, "Pass"); pass.def(py::init()) @@ -687,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle. .def("remove_pass", [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); -#ifndef _WIN32 // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( @@ -913,7 +908,6 @@ All parameter, weight, gradient are variables in Paddle. pybind11::gil_scoped_release release; self.Run(fetch_tensors, fetched_var_name); }); -#endif BindRecordIOWriter(&m); return m.ptr(); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 6a4a5e098f..543acf2d34 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -47,8 +47,7 @@ from . import profiler from . import unique_name from . import recordio_writer from . import parallel_executor -if os.name != 'nt': - from .parallel_executor import * +from .parallel_executor import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 0d53f53a9e..3f4dd5eb71 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -25,264 +25,263 @@ import os __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy'] -if os.name != 'nt': - ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy - BuildStrategy = core.ParallelExecutor.BuildStrategy - - class ParallelExecutor(object): +ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy +BuildStrategy = core.ParallelExecutor.BuildStrategy + + +class ParallelExecutor(object): + """ + ParallelExecutor is designed for data parallelism, which focuses on distributing + the data across different nodes and every node operates on the data in parallel. + If you use ParallelExecutor to run the current program on GPU, the node means GPU + device, and ParallelExecutor will get the available GPU device automatically on + the current machine. If you use ParallelExecutor to run the current program on CPU, + the node means the CPU device, and you can specify the CPU device number by adding + 'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable + is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number + of CPUs in the system. + + Args: + use_cuda (bool): Whether to use CUDA or not. + loss_name (str): The loss name must set in training. Default None. + main_program (Program): The program that need to run, if not provided, + then default_main_program will be used. Default None. + share_vars_from(ParallelExecutor): If provide, it will share variables + from the specified ParallelExecutor. Default None. + exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run + the program in ParallelExecutor, for example how many threads are used to + execute the program, how many iterations to clean up the temp variables + which is generated during execution. For more information, please refer + to fluid.ExecutionStrategy. Default None. + build_strategy(BuildStrategy): build_strategy is used to control how to + build the SSA Graph in ParallelExecutor by setting the property, + for example reduce_strategy, gradient_scale_strategy. For more information, + please refer to fluid.BuildStrategy. Default None. + num_trainers(int): If greater than 1, NCCL will be initialized with + multiple rank of nodes, each node should have same number of GPUs. + Distributed training will be enabled then. Default 1. + trainer_id(int): Must use together with num_trainers. trainer_id is the + "rank" of current node starts from 0. Default 0. + scope(Scope): scope to run with, default use fluid.global_scope(). + + Returns: + ParallelExecutor: The initialized ParallelExecutor object. + + Raises: + TypeError: If share_vars_from is provided, but not ParallelExecutor object. + + Examples: + .. code-block:: python + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) + test_exe = fluid.ParallelExecutor(use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + test_loss, = test_exe.run([loss.name], feed=feed_dict) + """ + + def __init__(self, + use_cuda, + loss_name=None, + main_program=None, + share_vars_from=None, + exec_strategy=None, + build_strategy=None, + num_trainers=1, + trainer_id=0, + scope=None): + self._places = [] + self._act_places = [] + if use_cuda: + for i in six.moves.range(core.get_cuda_device_count()): + p = core.Place() + self._act_places.append(core.CUDAPlace(i)) + p.set_place(self._act_places[-1]) + self._places.append(p) + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + for i in six.moves.range(cpu_num): + p = core.Place() + self._act_places.append(core.CPUPlace()) + p.set_place(self._act_places[-1]) + self._places.append(p) + assert self._places, "no place for execution" + + if exec_strategy is None: + exec_strategy = ExecutionStrategy() + exec_strategy.use_cuda = use_cuda + + if exec_strategy.num_threads == 0: + if use_cuda: + # Experiments on se-resnext shows that too many threads hurt + # performance. Worth tunning for other models in the future. + exec_strategy.num_threads = len(self._places) * 4 + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + exec_strategy.num_threads = cpu_num * 2 + + # Set 1 thread num under nccl2 distribute + # env to make sure all gpus run ops in same order. + if num_trainers > 1: + assert (use_cuda) + # FIXME(gongwb): avoid this set. + exec_strategy.num_threads = 1 + + if build_strategy is None: + build_strategy = BuildStrategy() + + main = main_program + main = main if main else framework.default_main_program() + if scope == None: + scope = executor.global_scope() + + if share_vars_from and not isinstance(share_vars_from, + ParallelExecutor): + raise TypeError("share_vars_from must be ParallelExecutor.") + + local_scopes = share_vars_from.executor.local_scopes( + ) if share_vars_from else [] + + self.persistable_vars = [ + v.name for v in [ + var for var in main.list_vars() + if var.persistable and var.type != core.VarDesc.VarType.RAW + ] + ] + + self.executor = core.ParallelExecutor( + self._places, + set([ + cpt.to_text(p.name) + for p in main.global_block().iter_parameters() + if not p.stop_gradient + ]), + set(cpt.to_text(var) for var in self.persistable_vars), main.desc, + cpt.to_text(loss_name) + if loss_name else six.u(''), scope, local_scopes, exec_strategy, + build_strategy, num_trainers, trainer_id) + self.scope = scope + + def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): """ - ParallelExecutor is designed for data parallelism, which focuses on distributing - the data across different nodes and every node operates on the data in parallel. - If you use ParallelExecutor to run the current program on GPU, the node means GPU - device, and ParallelExecutor will get the available GPU device automatically on - the current machine. If you use ParallelExecutor to run the current program on CPU, - the node means the CPU device, and you can specify the CPU device number by adding - 'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable - is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number - of CPUs in the system. + Run a parallel executor with fetch_list. + + The feed parameter can be a dict or a list. If feed is a dict, the + feed data will be split into multiple devices. If feed is a list, we + assume the data has been splitted into multiple devices, the each + element in the list will be copied to each device directly. + + For example, if the feed is a dict: + + >>> exe = ParallelExecutor() + >>> # the image will be splitted into devices. If there is two devices + >>> # each device will process an image with shape (24, 1, 28, 28) + >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) + + For example, if the feed is a list: + + >>> exe = ParallelExecutor() + >>> # each device will process each element in the list. + >>> # the 1st device will process an image with shape (48, 1, 28, 28) + >>> # the 2nd device will process an image with shape (32, 1, 28, 28) + >>> # + >>> # you can use exe.device_count to get the device number. + >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))}, + >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, + >>> ]) Args: - use_cuda (bool): Whether to use CUDA or not. - loss_name (str): The loss name must set in training. Default None. - main_program (Program): The program that need to run, if not provided, - then default_main_program will be used. Default None. - share_vars_from(ParallelExecutor): If provide, it will share variables - from the specified ParallelExecutor. Default None. - exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run - the program in ParallelExecutor, for example how many threads are used to - execute the program, how many iterations to clean up the temp variables - which is generated during execution. For more information, please refer - to fluid.ExecutionStrategy. Default None. - build_strategy(BuildStrategy): build_strategy is used to control how to - build the SSA Graph in ParallelExecutor by setting the property, - for example reduce_strategy, gradient_scale_strategy. For more information, - please refer to fluid.BuildStrategy. Default None. - num_trainers(int): If greater than 1, NCCL will be initialized with - multiple rank of nodes, each node should have same number of GPUs. - Distributed training will be enabled then. Default 1. - trainer_id(int): Must use together with num_trainers. trainer_id is the - "rank" of current node starts from 0. Default 0. - scope(Scope): scope to run with, default use fluid.global_scope(). + fetch_list(list): The fetched variable names + feed(list|dict|None): The feed variables. If the feed is a dict, + tensors in that dict will be splitted into each devices. If + the feed is a list, each element of the list will be copied + to each device. Default None. + feed_dict: Alias for feed parameter, for backward compatibility. + This parameter has been deprecated. Default None. + return_numpy(bool): Whether converts the fetched tensor to numpy. + Default: True. Returns: - ParallelExecutor: The initialized ParallelExecutor object. + List: The fetched result list. Raises: - TypeError: If share_vars_from is provided, but not ParallelExecutor object. + ValueError: If the feed is a list, but its length is not equal the + length of active places, or its element's is not dict. + + NOTES: + 1. If the feed's type is dict, the number of data that feeds to + ParallelExecutor must be bigger than active places. Otherwise, + it will throw exception from C++ side. Special attention should be + paid to check whether the last batch of the dataset is bigger + than active places. + 2. If active places are more than one, the fetch results for each + variable is a list, and each element of this list is the variable of + respective active place. Examples: .. code-block:: python - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) - test_exe = fluid.ParallelExecutor(use_cuda=True, - main_program=test_program, - share_vars_from=train_exe) - - train_loss, = train_exe.run([loss.name], feed=feed_dict) - test_loss, = test_exe.run([loss.name], feed=feed_dict) + pe = fluid.ParallelExecutor(use_cuda=use_cuda, + loss_name=avg_cost.name, + main_program=fluid.default_main_program()) + loss = pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])) """ - - def __init__(self, - use_cuda, - loss_name=None, - main_program=None, - share_vars_from=None, - exec_strategy=None, - build_strategy=None, - num_trainers=1, - trainer_id=0, - scope=None): - self._places = [] - self._act_places = [] - if use_cuda: - for i in six.moves.range(core.get_cuda_device_count()): - p = core.Place() - self._act_places.append(core.CUDAPlace(i)) - p.set_place(self._act_places[-1]) - self._places.append(p) - else: - cpu_num = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - for i in six.moves.range(cpu_num): - p = core.Place() - self._act_places.append(core.CPUPlace()) - p.set_place(self._act_places[-1]) - self._places.append(p) - assert self._places, "no place for execution" - - if exec_strategy is None: - exec_strategy = ExecutionStrategy() - exec_strategy.use_cuda = use_cuda - - if exec_strategy.num_threads == 0: - if use_cuda: - # Experiments on se-resnext shows that too many threads hurt - # performance. Worth tunning for other models in the future. - exec_strategy.num_threads = len(self._places) * 4 - else: - cpu_num = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - exec_strategy.num_threads = cpu_num * 2 - - # Set 1 thread num under nccl2 distribute - # env to make sure all gpus run ops in same order. - if num_trainers > 1: - assert (use_cuda) - # FIXME(gongwb): avoid this set. - exec_strategy.num_threads = 1 - - if build_strategy is None: - build_strategy = BuildStrategy() - - main = main_program - main = main if main else framework.default_main_program() - if scope == None: - scope = executor.global_scope() - - if share_vars_from and not isinstance(share_vars_from, - ParallelExecutor): - raise TypeError("share_vars_from must be ParallelExecutor.") - - local_scopes = share_vars_from.executor.local_scopes( - ) if share_vars_from else [] - - self.persistable_vars = [ - v.name for v in [ - var for var in main.list_vars() - if var.persistable and var.type != core.VarDesc.VarType.RAW - ] - ] - - self.executor = core.ParallelExecutor( - self._places, - set([ - cpt.to_text(p.name) - for p in main.global_block().iter_parameters() - if not p.stop_gradient - ]), - set(cpt.to_text(var) - for var in self.persistable_vars), main.desc, - cpt.to_text(loss_name) - if loss_name else six.u(''), scope, local_scopes, exec_strategy, - build_strategy, num_trainers, trainer_id) - self.scope = scope - - def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): - """ - Run a parallel executor with fetch_list. - - The feed parameter can be a dict or a list. If feed is a dict, the - feed data will be split into multiple devices. If feed is a list, we - assume the data has been splitted into multiple devices, the each - element in the list will be copied to each device directly. - - For example, if the feed is a dict: - - >>> exe = ParallelExecutor() - >>> # the image will be splitted into devices. If there is two devices - >>> # each device will process an image with shape (24, 1, 28, 28) - >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) - - For example, if the feed is a list: - - >>> exe = ParallelExecutor() - >>> # each device will process each element in the list. - >>> # the 1st device will process an image with shape (48, 1, 28, 28) - >>> # the 2nd device will process an image with shape (32, 1, 28, 28) - >>> # - >>> # you can use exe.device_count to get the device number. - >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))}, - >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, - >>> ]) - - Args: - fetch_list(list): The fetched variable names - feed(list|dict|None): The feed variables. If the feed is a dict, - tensors in that dict will be splitted into each devices. If - the feed is a list, each element of the list will be copied - to each device. Default None. - feed_dict: Alias for feed parameter, for backward compatibility. - This parameter has been deprecated. Default None. - return_numpy(bool): Whether converts the fetched tensor to numpy. - Default: True. - - Returns: - List: The fetched result list. - - Raises: - ValueError: If the feed is a list, but its length is not equal the - length of active places, or its element's is not dict. - - NOTES: - 1. If the feed's type is dict, the number of data that feeds to - ParallelExecutor must be bigger than active places. Otherwise, - it will throw exception from C++ side. Special attention should be - paid to check whether the last batch of the dataset is bigger - than active places. - 2. If active places are more than one, the fetch results for each - variable is a list, and each element of this list is the variable of - respective active place. - - Examples: - .. code-block:: python - - pe = fluid.ParallelExecutor(use_cuda=use_cuda, - loss_name=avg_cost.name, - main_program=fluid.default_main_program()) - loss = pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name])) - """ - if feed is None and feed_dict is not None: - feed = feed_dict - print( - "`feed_dict` is deprecated. Please use `feed=`", - file=sys.stderr) - - if isinstance(feed, dict): - feed_tensor_dict = dict() - for feed_name in feed: - feed_tensor = feed[feed_name] - if not isinstance(feed_tensor, core.LoDTensor): - feed_tensor = core.LoDTensor() - # always set to CPU place, since the tensor need to be splitted - # it is fast in CPU - feed_tensor.set(feed[feed_name], core.CPUPlace()) - feed_tensor_dict[feed_name] = feed_tensor - - self.executor.feed_and_split_tensor_into_local_scopes( - feed_tensor_dict) - elif isinstance(feed, list) or isinstance(feed, tuple): - if len(feed) != len(self._act_places): - raise ValueError( - "Feed a list of tensor, the list should be the same size as places" - ) - - res = list() - - for i, each in enumerate(feed): - if not isinstance(each, dict): - raise TypeError( - "Each element of feed list should be a dict") - res_dict = dict() - for feed_name in each: - tensor = each[feed_name] - if not isinstance(tensor, core.LoDTensor): - tmp = core.LoDTensor() - tmp.set(tensor, self._act_places[i]) - tensor = tmp - res_dict[feed_name] = tensor - res.append(res_dict) - self.executor.feed_tensors_into_local_scopes(res) - - fetch_var_name = '@FETCHED_VAR_NAME@' - self.executor.run(fetch_list, fetch_var_name) - arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() - - if return_numpy: - return executor.as_numpy(arr) - - return [arr[i] for i in range(len(arr))] - - @property - def device_count(self): - return len(self._act_places) + if feed is None and feed_dict is not None: + feed = feed_dict + print( + "`feed_dict` is deprecated. Please use `feed=`", + file=sys.stderr) + + if isinstance(feed, dict): + feed_tensor_dict = dict() + for feed_name in feed: + feed_tensor = feed[feed_name] + if not isinstance(feed_tensor, core.LoDTensor): + feed_tensor = core.LoDTensor() + # always set to CPU place, since the tensor need to be splitted + # it is fast in CPU + feed_tensor.set(feed[feed_name], core.CPUPlace()) + feed_tensor_dict[feed_name] = feed_tensor + + self.executor.feed_and_split_tensor_into_local_scopes( + feed_tensor_dict) + elif isinstance(feed, list) or isinstance(feed, tuple): + if len(feed) != len(self._act_places): + raise ValueError( + "Feed a list of tensor, the list should be the same size as places" + ) + + res = list() + + for i, each in enumerate(feed): + if not isinstance(each, dict): + raise TypeError( + "Each element of feed list should be a dict") + res_dict = dict() + for feed_name in each: + tensor = each[feed_name] + if not isinstance(tensor, core.LoDTensor): + tmp = core.LoDTensor() + tmp.set(tensor, self._act_places[i]) + tensor = tmp + res_dict[feed_name] = tensor + res.append(res_dict) + self.executor.feed_tensors_into_local_scopes(res) + + fetch_var_name = '@FETCHED_VAR_NAME@' + self.executor.run(fetch_list, fetch_var_name) + arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() + + if return_numpy: + return executor.as_numpy(arr) + + return [arr[i] for i in range(len(arr))] + + @property + def device_count(self): + return len(self._act_places) From 6e66fadb951fe02218ab2be2916bc12c4b966e00 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 21 Nov 2018 15:24:23 +0800 Subject: [PATCH 037/113] clean up the pre-definitions on windows --- CMakeLists.txt | 2 ++ cmake/operators.cmake | 3 +-- paddle/fluid/framework/eigen.h | 5 ----- paddle/fluid/framework/op_registry.h | 5 ----- paddle/fluid/framework/operator.cc | 2 -- paddle/fluid/framework/operator.h | 2 -- paddle/fluid/inference/api/api_impl.h | 6 ------ paddle/fluid/platform/cpu_helper.cc | 1 + paddle/fluid/platform/dynload/cudnn.h | 2 -- paddle/fluid/platform/enforce.h | 6 ------ paddle/fluid/platform/init.h | 3 --- paddle/fluid/platform/port.h | 4 ++++ paddle/fluid/platform/profiler.cc | 4 ++-- paddle/fluid/pybind/pybind.cc | 7 ------- 14 files changed, 10 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 27f2d81dd5..5325e3034c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -137,6 +137,8 @@ if (WIN32) "Disable DSO when compiling for Windows" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling for Windows" FORCE) + set(WITH_DISTRIBUTE OFF CACHE STRING + "Disable DISTRIBUTE when compiling for Windows" FORCE) endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 0bc4dbe6cf..17107e0698 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,8 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" - "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index 2b265a773f..5bafa4345f 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -13,11 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -// logging.h and windows.h conflict -#define GLOG_NO_ABBREVIATED_SEVERITIES -// solve static linking error in windows -// https://github.com/google/glog/issues/301 -#define GOOGLE_GLOG_DLL_DECL #include "paddle/fluid/framework/tensor.h" #include "unsupported/Eigen/CXX11/Tensor" diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index ef2eb334a4..0e6e74293c 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,11 +23,6 @@ limitations under the License. */ #include #include -#if defined(_WIN32) -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#define GOOGLE_GLOG_DLL_DECL -#endif - #include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2b35943d09..1ec170b6f6 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL #include #include diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 6918e030bf..ef83833217 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -20,8 +20,6 @@ limitations under the License. */ #include #include #include -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 4e4ab47ca9..9dfa48d501 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -14,12 +14,6 @@ limitations under the License. */ #pragma once -// logging.h and windows.h conflict -#define GLOG_NO_ABBREVIATED_SEVERITIES -// solve static linking error in windows -// https://github.com/google/glog/issues/301 -#define GOOGLE_GLOG_DLL_DECL - #include #include #include diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index bd6aedb3ac..f2d691b293 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -30,6 +30,7 @@ namespace platform { void SetNumThreads(int num_threads) { #ifdef PADDLE_USE_OPENBLAS // windows has no support for openblas multi-thread +// please refer to: https://github.com/PaddlePaddle/Paddle/issues/7234 #ifdef _WIN32 if (num_threads > 1) { num_threads = 1; diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 065b940b9c..1a83ac7780 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL #include #include diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 31309738a5..a85972bdb7 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -18,12 +18,6 @@ limitations under the License. */ #include // for __cxa_demangle #endif // __GNUC__ -#if defined(_WIN32) -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#define GOOGLE_GLOG_DLL_DECL -#endif - #ifdef PADDLE_WITH_CUDA #include #include diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h index 992ca5e6f6..0e30594672 100644 --- a/paddle/fluid/platform/init.h +++ b/paddle/fluid/platform/init.h @@ -16,9 +16,6 @@ limitations under the License. */ #include #include -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL - #include "gflags/gflags.h" #include "glog/logging.h" diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index 8be77fe464..ad070171df 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -31,6 +31,10 @@ #include #include // std::accumulate #else +#define NOMINMAX // msvc max/min macro conflict with std::min/max +// solve static linking error in windows +// https://github.com/google/glog/issues/301 +#define GOOGLE_GLOG_DLL_DECL #include // _popen, _pclose #include #include diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 03c102e24a..998242fb4a 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -438,10 +438,10 @@ void ParseEvents(const std::vector>& events, event_items[index].total_time += event_time; // min time event_items[index].min_time = - (std::min)(event_time, event_items[index].min_time); + std::min(event_time, event_items[index].min_time); // max time event_items[index].max_time = - (std::max)(event_time, event_items[index].max_time); + std::max(event_time, event_items[index].max_time); } // remove the push marker from the list diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 102fa02adf..6cc3a1739a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,13 +21,6 @@ limitations under the License. */ #include #include -#if defined(_WIN32) -#define NOMINMAX -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#define GOOGLE_GLOG_DLL_DECL -#include -#endif - #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" From c19ff1f3d28b38867de8b98d63f19b8c759c4535 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 21 Nov 2018 15:37:36 +0800 Subject: [PATCH 038/113] Add python3.6 and python3.7 support in padde build scripts test=develop --- paddle/scripts/paddle_build.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 32f9bca645..569e56e5a9 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -116,6 +116,18 @@ function cmake_gen() { export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so" + elif [ "$1" == "cp36-cp36m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so" + elif [ "$1" == "cp37-cp37m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so" fi fi fi @@ -419,7 +431,7 @@ function assert_api_not_changed() { source .env/bin/activate pip install ${PADDLE_ROOT}/build/python/dist/*whl python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec - if [ "$1" == "cp35-cp35m" ]; then + if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' new.spec sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec From 255cc1eb6540785c8cb786a6c9f291fa53010ca0 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 21 Nov 2018 15:43:17 +0800 Subject: [PATCH 039/113] Add support for Mac build test=develop --- paddle/scripts/paddle_build.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 569e56e5a9..9632eaec00 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -94,6 +94,30 @@ function cmake_gen() { else exit 1 fi + elif [ "$1" == "cp36-cp36m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi + elif [ "$1" == "cp37-cp37m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi fi else if [ "$1" != "" ]; then From f913860873781ff4ccc9ee2eba73365d530fae22 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 21 Nov 2018 08:47:12 +0000 Subject: [PATCH 040/113] jitkernel lstm refer support peephole test=develop --- .../fluid/operators/fused/fusion_lstm_op.cc | 73 +++-- paddle/fluid/operators/math/jit_code.cc | 6 +- paddle/fluid/operators/math/jit_code.h | 42 ++- paddle/fluid/operators/math/jit_kernel.h | 15 +- paddle/fluid/operators/math/jit_kernel_impl.h | 14 +- .../fluid/operators/math/jit_kernel_macro.h | 8 +- .../fluid/operators/math/jit_kernel_refer.h | 35 ++- paddle/fluid/operators/math/jit_kernel_rnn.cc | 288 +++++++----------- .../fluid/operators/math/jit_kernel_test.cc | 32 +- 9 files changed, 250 insertions(+), 263 deletions(-) diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 0959539068..8021a896ce 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -236,27 +236,31 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const auto& ker = \ - math::jitkernel::KernelPool::Instance() \ - .template Get, const std::string&, \ - const std::string&, const std::string&>( \ - ctx.Attr("gate_activation"), \ - ctx.Attr("candidate_activation"), \ - ctx.Attr("cell_activation"), D, use_peepholes) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const math::jitkernel::lstm_attr_t attr( \ + D, ctx.Attr("gate_activation"), \ + ctx.Attr("candidate_activation"), \ + ctx.Attr("cell_activation"), use_peepholes); \ + math::jitkernel::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, \ + const math::jitkernel::lstm_attr_t&>(attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ @@ -299,7 +303,10 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_h_data = h0_data + bid * D; prev_c_data = c0_data + bid * D; } else { - ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data); + one_step.gates = xx_data; + one_step.ct = c_out_data; + one_step.ht = h_out_data; + ker->ComputeC1H1(&one_step, &attr); tstart = 1; // move one step prev_h_data = h_out_data; @@ -310,8 +317,12 @@ class FuisonLSTMKernel : public framework::OpKernel { } for (int step = tstart; step < seq_len; ++step) { GEMM_WH_ADDON(1, prev_h_data, xx_data); - ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data, - checked_cell_data); + + one_step.gates = xx_data; + one_step.ct_1 = prev_c_data; + one_step.ct = c_out_data; + one_step.ht = h_out_data; + ker->ComputeCtHt(&one_step, &attr); // move one step prev_h_data = h_out_data; prev_c_data = c_out_data; @@ -388,7 +399,11 @@ class FuisonLSTMKernel : public framework::OpKernel { T* cur_h_out_data = batched_h_out_data; T* cur_c_out_data = batched_c_out_data; for (int i = 0; i < max_bs; ++i) { - ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data); + one_step.gates = cur_in_data; + one_step.ct = cur_c_out_data; + one_step.ht = cur_h_out_data; + ker->ComputeC1H1(&one_step, &attr); + cur_in_data += D4; cur_c_out_data += D; cur_h_out_data += D; @@ -413,8 +428,12 @@ class FuisonLSTMKernel : public framework::OpKernel { T* cur_c_out_data = batched_c_out_data; T* cur_h_out_data = batched_h_out_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data, wp_data, checked_cell_data); + one_step.gates = cur_in_data; + one_step.ct_1 = cur_prev_c_data; + one_step.ct = cur_c_out_data; + one_step.ht = cur_h_out_data; + ker->ComputeCtHt(&one_step, &attr); + // move one batch cur_in_data += D4; cur_prev_c_data += D; diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 418c843362..ccc9206f5c 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -233,7 +233,7 @@ void LSTMJitCode::generate() { vmovups(ymm_src, ptr[reg_ptr_gates + offset + num_]); act(ymm_i, ymm_src, act_gate_); vmulps(ymm_c, ymm_c, ymm_i); - if (first_) { + if (!compute_c1h1_) { // f vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * num_]); act(ymm_f, ymm_src, act_gate_); @@ -242,8 +242,8 @@ void LSTMJitCode::generate() { vaddps(ymm_f, ymm_f, ymm_c); } /* H_t = act_cell(C_t) * ogated */ - ymm_t ymm_ct = first_ ? ymm_c : ymm_f; - ymm_t ymm_o = first_ ? ymm_f : ymm_c; + ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f; + ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c; ymm_t ymm_tmp = ymm_i; act(ymm_tmp, ymm_ct, act_cell_); vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * num_]); diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 9782f5414c..bf28d444b7 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -319,6 +319,12 @@ class LSTMJitCode : public VActJitCode { public: const char* name() const override { std::string base = "LSTMJitCode"; + if (use_peephole_) { + base += "_Peephole"; + } + if (compute_c1h1_) { + base += "_C1H1"; + } auto AddTypeStr = [&](operand_type type) { switch (type) { case operand_type::relu: @@ -340,30 +346,42 @@ class LSTMJitCode : public VActJitCode { break; } }; - if (first_) { - base += "_C1H1"; - } AddTypeStr(act_gate_); AddTypeStr(act_cand_); AddTypeStr(act_cell_); return base.c_str(); } - explicit LSTMJitCode(int d, bool first, operand_type act_gate, - operand_type act_cand, operand_type act_cell, + explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr, size_t code_size = 256 * 1024, void* code_ptr = nullptr) - : VActJitCode(d, act_gate, code_size, code_ptr), - num_(d), - first_(first), - act_gate_(act_gate), - act_cand_(act_cand), - act_cell_(act_cell) {} + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + compute_c1h1_(compute_c1h1) { + auto typeExchange = [](const std::string& type) -> gen::operand_type { + if (type == "sigmoid") { + return operand_type::sigmoid; + } else if (type == "relu") { + return operand_type::relu; + } else if (type == "tanh") { + return operand_type::tanh; + } else if (type == "identity" || type == "") { + return operand_type::identity; + } // else throw error + return operand_type::identity; + }; + num_ = attr.d; + use_peephole_ = attr.use_peephole; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + act_cell_ = typeExchange(attr.act_cell); + } static bool init(int d); void generate() override; protected: int num_; - bool first_; + bool compute_c1h1_; + bool use_peephole_; operand_type act_gate_; operand_type act_cand_; operand_type act_cell_; diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 36199eddaf..bb5ba5813a 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -122,18 +122,9 @@ class VTanhKernel : public VActKernel {}; template class LSTMKernel : public Kernel { public: - virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht, - /* below only used in peephole*/ - const T *wp_data = nullptr, - T *checked = nullptr) const = 0; - - virtual void ComputeC1H1(T *gates, T *ct, T *ht, - /* below only used in peephole*/ - const T *wp_data = nullptr) const = 0; - - // void (*ComputeCtHt)(lstm_t *); - // // compute c1 and h1 without c0 or h0 - // void (*ComputeC1H1)(lstm_t *); + void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *); + // compute c1 and h1 without c0 or h0 + void (*ComputeC1H1)(lstm_t *, const lstm_attr_t *); }; template diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h index 337d5ae914..2e734ca940 100644 --- a/paddle/fluid/operators/math/jit_kernel_impl.h +++ b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -33,18 +33,24 @@ typedef struct { const void* ct_1; void* ct; void* ht; - /* below only used in peephole*/ - const void* wp_data{nullptr}; + /* weight_peephole and checked data are only used in peephole*/ + const void* wp{nullptr}; void* checked{nullptr}; } lstm_t; typedef struct lstm_attr_s { + bool use_peephole; int d; std::string act_gate, act_cand, act_cell; lstm_attr_s() = default; lstm_attr_s(int _d, const std::string& _act_gate, - const std::string& _act_cand, const std::string& _act_cell) - : d(_d), act_gate(_act_gate), act_cand(_act_cand), act_cell(_act_cell) {} + const std::string& _act_cand, const std::string& _act_cell, + bool _use_peephole = false) + : use_peephole(_use_peephole), + d(_d), + act_gate(_act_gate), + act_cand(_act_cand), + act_cell(_act_cell) {} } lstm_attr_t; } // namespace jitkernel diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index 8acf60cfbf..5a3efd979f 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -82,10 +82,10 @@ namespace jitkernel { #define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name, \ marco_declare, macro_find_key, macro_impl) \ marco_define_name(ker_key, ker_class); \ - REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, JITKERNEL_DECLARE, \ - JITKERNEL_FIND_KEY, JITKERNEL_IMPL); \ - REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, JITKERNEL_DECLARE, \ - JITKERNEL_FIND_KEY, JITKERNEL_IMPL) + REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, marco_declare, \ + macro_find_key, macro_impl); \ + REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, marco_declare, \ + macro_find_key, macro_impl) #define REGISTER_JITKERNEL(ker_key, ker_class) \ REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \ diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index 9c60ebc587..097bb85956 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -117,11 +117,13 @@ void (*getActFunc(const std::string& type))(const T*, T*, int) { // NOLINT } template -void LSTMCtHt(lstm_t* step, lstm_attr_t* attr) { +void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { T* gates = reinterpret_cast(step->gates); const T* ct_1 = reinterpret_cast(step->ct_1); T* ct = reinterpret_cast(step->ct); T* ht = reinterpret_cast(step->ht); + const T* wp = reinterpret_cast(step->wp); + T* checked = reinterpret_cast(step->checked); auto act_gate = getActFunc(attr->act_gate); auto act_cand = getActFunc(attr->act_cand); auto act_cell = getActFunc(attr->act_cell); @@ -129,23 +131,36 @@ void LSTMCtHt(lstm_t* step, lstm_attr_t* attr) { int d2 = d * 2; int d3 = d * 3; // gates: W_ch, W_ih, W_fh, W_oh - act_gate(gates + d, gates + d, d3); + if (attr->use_peephole) { + VMul(wp, ct_1, checked, d); + VMul(wp + d, ct_1, checked + d, d); + VAdd(checked, gates + d, gates + d, d2); + act_gate(gates + d, gates + d, d2); + } else { + act_gate(gates + d, gates + d, d3); + } - /* C_t = C_t-1 * fgated + cand_gated * igated */ + // C_t = C_t-1 * fgated + cand_gated * igated act_cand(gates, gates, d); VMul(gates, gates + d, gates + d, d); VMul(ct_1, gates + d2, gates + d2, d); VAdd(gates + d, gates + d2, ct, d); - /* H_t = act_cell(C_t) * ogated */ + if (attr->use_peephole) { + // get ogated + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + act_gate(gates + d3, gates + d3, d); + } + // H_t = act_cell(C_t) * ogated act_cell(ct, gates + d2, d); VMul(gates + d2, gates + d3, ht, d); } +// compute c1 and h1 without c0 or h0 template -void LSTMC1H1(lstm_t* step, lstm_attr_t* attr) { +void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { T* gates = reinterpret_cast(step->gates); - const T* ct_1 = reinterpret_cast(step->ct_1); T* ct = reinterpret_cast(step->ct); T* ht = reinterpret_cast(step->ht); auto act_gate = getActFunc(attr->act_gate); @@ -158,10 +173,16 @@ void LSTMC1H1(lstm_t* step, lstm_attr_t* attr) { act_gate(gates + d, gates + d, d); act_cand(gates, gates, d); VMul(gates, gates + d, ct, d); + if (attr->use_peephole) { + // get outgated, put W_oc * C_t on igated + const T* wp = reinterpret_cast(step->wp); + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + } /* H_t = act_cell(C_t) * ogated */ act_gate(gates + d3, gates + d3, d); act_cell(ct, gates + d2, d); - Vmul(gates + d2, gates + d3, ht, d); + VMul(gates + d2, gates + d3, ht, d); } } // namespace refer diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index e79b0400ab..6b7463aa52 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -15,9 +15,14 @@ limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" +#ifdef PADDLE_WITH_XBYAK +#include "paddle/fluid/operators/math/jit_code.h" +#endif + #ifdef __AVX__ #include #endif @@ -154,211 +159,136 @@ static std::unique_ptr GetAVXAct(const std::string& type) { #endif /* LSTM JitKernel */ -template +template class LSTMKernelImpl : public LSTMKernel { public: - explicit LSTMKernelImpl(const std::string& act_gate, - const std::string& act_cand, - const std::string& act_cell, int d) - : LSTMKernel() { - d_ = d; - d2_ = d * 2; - d3_ = d * 3; - act_gate_d3_ = GetActKernel(act_gate, d3_); - act_gate_d_ = GetActKernel(act_gate, d); - act_cand_d_ = GetActKernel(act_cand, d); - act_cell_d_ = GetActKernel(act_cell, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - vadd_d_ = KernelPool::Instance().template Get>(d); + static inline std::string name(const lstm_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit LSTMKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); + this->ComputeCtHt = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096)); + this->ComputeC1H1 = + jitcode1_->getCode(); + return; + } +#endif - void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, - T* checked) const override { - // gates: W_ch, W_ih, W_fh, W_oh - act_gate_d3_->Compute(gates + d_, gates + d_, d3_); - - /* C_t = C_t-1 * fgated + cand_gated * igated */ - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, gates + d_, d_); - vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); - vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); - - /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); - } - void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { - /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_, d_); - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, ct, d_); - /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); + this->ComputeCtHt = refer::LSTMCtHt; + this->ComputeC1H1 = refer::LSTMC1H1; } +#ifdef PADDLE_WITH_XBYAK + private: - int d_, d2_, d3_; - std::shared_ptr> act_gate_d3_, act_gate_d_, act_cand_d_, - act_cell_d_; - std::shared_ptr> vmul_d_; - std::shared_ptr> vadd_d_; -#ifdef __AVX__ - std::unique_ptr avx_act_gate_, avx_act_cand_, avx_act_cell_; + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}; #endif }; -#define INTRI8_FLOAT(isa) \ - template <> \ - LSTMKernelImpl::LSTMKernelImpl( \ - const std::string& act_gate, const std::string& act_cand, \ - const std::string& act_cell, int d) \ - : LSTMKernel() { \ - avx_act_gate_ = GetAVXAct(act_gate); \ - avx_act_cand_ = GetAVXAct(act_cand); \ - avx_act_cell_ = GetAVXAct(act_cell); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeCtHt( \ - float* gates, const float* ct_1, float* ct, float* ht, \ - const float* wp_data, float* checked) const { \ - /* gates: W_ch, W_ih, W_fh, W_oh */ \ - __m256 c, i, f, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - f = _mm256_loadu_ps(gates + 16); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ - c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ - i = _mm256_loadu_ps(ct_1); \ - f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ - f = _mm256_add_ps(c, f); \ - _mm256_storeu_ps(ct, f); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeC1H1( \ - float* gates, float* ct, float* ht, const float* wp_data) const { \ - __m256 c, i, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = igated * cgated*/ \ - c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \ - _mm256_storeu_ps(ct, c); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ - } - -// TODO(TJ): optimize keq16 - -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); +#ifdef PADDLE_WITH_XBYAK +template <> +bool LSTMKernelImpl::useJIT(int d) { + return false; // not ready yet gen::LSTMJitCode::init(d); +} #endif /* Peephole JitKernel */ -template +template class PeepholeKernelImpl : public LSTMKernel { public: - explicit PeepholeKernelImpl(const std::string& act_gate, - const std::string& act_cand, - const std::string& act_cell, int d) - : LSTMKernel() { - d_ = d; - d2_ = d * 2; - d3_ = d * 3; - act_gate_d_ = GetActKernel(act_gate, d); - act_cand_d_ = GetActKernel(act_cand, d); - act_cell_d_ = GetActKernel(act_cell, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - vadd_d_ = KernelPool::Instance().template Get>(d); - vadd_d2_ = KernelPool::Instance().template Get>(d2_); - act_gate_d2_ = GetActKernel(act_gate, d2_); + static inline std::string name(const lstm_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit PeepholeKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); + this->ComputeCtHt = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096)); + this->ComputeC1H1 = + jitcode1_->getCode(); + return; + } +#endif - void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, - T* checked) const override { - /* get fgated and igated*/ - vmul_d_->Compute(wp_data, ct_1, checked, d_); - vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); - vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_); - act_gate_d2_->Compute(gates + d_, gates + d_, d2_); - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, gates + d_, d_); - vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); - vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); - /* get ogated*/ - vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); - vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); + this->ComputeCtHt = refer::LSTMCtHt; + this->ComputeC1H1 = refer::LSTMC1H1; } - void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { - /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_, d_); - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, ct, d_); - /* get outgated, put W_oc * C_t on igated */ - vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); - vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); - /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); - } +#ifdef PADDLE_WITH_XBYAK private: - int d_, d2_, d3_; - std::shared_ptr> act_gate_d2_, act_gate_d_, act_cand_d_, - act_cell_d_; - std::shared_ptr> vmul_d_; - std::shared_ptr> vadd_d_, vadd_d2_; + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}; +#endif }; -#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ - template <> \ - std::shared_ptr> \ - KernelPool::Get, const std::string&, \ - const std::string&, const std::string&, int, bool>( \ - const std::string& act_gate, const std::string& act_cand, \ - const std::string& act_cell, int d, bool use_peephole) - -#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \ - #ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \ - (use_peephole ? "p" : "n") - -#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k) \ - if (use_peephole) { \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>( \ - act_gate, act_cand, act_cell, d)); \ - } else { \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>(act_gate, act_cand, \ - act_cell, d)); \ +#ifdef PADDLE_WITH_XBYAK +template <> +bool PeepholeKernelImpl::useJIT(int d) { + return false; // peephole jitcode not ready yet +} +#endif + +#define JITKERNEL_DEFINE_NAME_LSTM(ker_key, ker_class) \ + template <> \ + std::string ker_class##Impl::name(const lstm_attr_t& attr) { \ + std::string key(#ker_key "f"); \ + key += (attr.act_gate + attr.act_cand + attr.act_cell + \ + (attr.use_peephole ? "p" : "n")); \ + if (useJIT(attr.d)) { \ + /* only jit code need record d*/ \ + return key + "jit" + std::to_string(attr.d); \ + } else if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } \ + template <> \ + std::string ker_class##Impl::name(const lstm_attr_t& attr) { \ + std::string key(#ker_key "d"); \ + /* jit code do not support double yet*/ \ + if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ } -REGISTER_JITKERNEL_ARGS_DEPRECATED(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM, - JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL); +#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const lstm_attr_t&>( \ + const lstm_attr_t& attr) + +#define JITKERNEL_FIND_KEY_LSTM(ker_class, ker_dtype) \ + std::string key = ker_class##Impl::name(attr) + +#define JITKERNEL_LSTM_IMPL(ker, dtype) \ + if (attr.use_peephole) { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); \ + } else { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); \ + } -#undef INTRI8_FLOAT -#undef JITKERNEL_DECLARE_LSTM -#undef JITKERNEL_KEY_LSTM -#undef JITKERNEL_NEW_LSTM_IMPL +REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DEFINE_NAME_LSTM, + JITKERNEL_DECLARE_LSTM, JITKERNEL_FIND_KEY_LSTM, + JITKERNEL_LSTM_IMPL); /* GRU JitKernel */ template diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index a1705a81c4..1cbe1b5d95 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -341,11 +341,11 @@ TEST(JitKernel, lstm) { RandomVec(d, ct_1.data(), -2.f, 2.f); memcpy(xref.data(), x.data(), sizeof(float) * d4); std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell, false); const auto& ker = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, d, false); + .template Get, const jit::lstm_attr_t&>( + attr); // below kernels are used to compute refer const auto& vsigmoid_3d = jit::KernelPool::Instance().template Get>( @@ -366,14 +366,16 @@ TEST(JitKernel, lstm) { float* ht_ref_data = ht_ref.data(); // compute once to check correctness jit::lstm_t step; - jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell); step.gates = xref_data; step.ct_1 = ct_1_data; step.ct = ct_ref_data; step.ht = ht_ref_data; refer::LSTMCtHt(&step, &attr); - ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + step.gates = x_data; + step.ct = ct_tgt_data; + step.ht = ht_tgt_data; + ker->ComputeCtHt(&step, &attr); for (int i = 0; i < d; ++i) { EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3); EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3); @@ -392,7 +394,7 @@ TEST(JitKernel, lstm) { auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + ker->ComputeCtHt(&step, &attr); } auto ttgte = GetCurrentUS(); VLOG(30) << "Vec size " << d @@ -710,21 +712,21 @@ TEST(JitKernel, pool) { namespace jit = paddle::operators::math::jitkernel; const int frame_size = 4; std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); + const auto& plstm1 = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, false); + .template Get, const jit::lstm_attr_t&>(attr); + const auto& plstm2 = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, false); + .template Get, const jit::lstm_attr_t&>(attr); + EXPECT_EQ(plstm1, plstm2); + const auto& peephole = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, true); + .template Get, const jit::lstm_attr_t&>( + jit::lstm_attr_t(frame_size, act_gate, act_cand, act_cell, true)); EXPECT_TRUE(plstm1 != peephole); const auto& pvmul_f = From 94de2290f4cbc6df0050d06a2d381ba941eb78d1 Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Wed, 21 Nov 2018 08:58:04 +0000 Subject: [PATCH 041/113] fix format in api doc, test=develop --- python/paddle/fluid/layers/nn.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 5749bcc54a..be120e45d2 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6904,13 +6904,13 @@ def prelu(x, mode, param_attr=None, name=None): Args: x (Variable): The input tensor. - param_attr(ParamAttr|None): The parameter attribute for the learnable - weight (alpha). + param_attr(ParamAttr|None): The parameter attribute for the learnable + weight (alpha). mode (string): The mode for weight sharing. It supports all, channel and element. all: all elements share same weight channel:elements in a channel share same weight element:each element has a weight - name(str|None): A name for this layer(optional). If set None, the layer + name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. Returns: @@ -6920,9 +6920,9 @@ def prelu(x, mode, param_attr=None, name=None): .. code-block:: python - x = fluid.layers.data(name="x", shape=[10,10], dtype="float32") - mode = 'channel' - output = fluid.layers.prelu(x,mode) + x = fluid.layers.data(name="x", shape=[10,10], dtype="float32") + mode = 'channel' + output = fluid.layers.prelu(x,mode) """ helper = LayerHelper('prelu', **locals()) if mode not in ['all', 'channel', 'element']: From f10e196fc8de5d76333940a263dabf33f0450fa5 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 21 Nov 2018 18:09:44 +0800 Subject: [PATCH 042/113] fix build issue --- paddle/fluid/inference/tensorrt/convert/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 27fb41d16e..840abd26a7 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -18,7 +18,7 @@ nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op SERIAL) nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op tensorrt_plugin SERIAL) nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin elementwise_add_op elementwise_mul_op SERIAL) From 3e3599f3d937e0444606056f3c9f2261b74dfd93 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 21 Nov 2018 11:31:04 +0000 Subject: [PATCH 043/113] Refine split tensorrt plugin --- .../inference/tensorrt/convert/split_op.cc | 3 +- .../tensorrt/plugin/split_op_plugin.cu | 157 ++++++++++++++---- .../tensorrt/plugin/split_op_plugin.h | 9 +- 3 files changed, 134 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 6620c76318..871354267e 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -40,7 +40,7 @@ class SplitOpConverter : public OpConverter { int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = boost::get>(op_desc.GetAttr("sections")); - PADDLE_ENFORCE(axis != 0); + // PADDLE_ENFORCE(axis != 0); if (axis < 0) { axis += input_dims.nbDims; } else { @@ -48,7 +48,6 @@ class SplitOpConverter : public OpConverter { } PADDLE_ENFORCE(output_lengths.size() == output_num); - // plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 4adea2db1e..1ec0753e9f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { @@ -19,6 +21,52 @@ namespace inference { namespace tensorrt { namespace plugin { +// copied from operators::math::SplitFunctor +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int* out_cols, + int out_cols_size, T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = out_cols[0]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = out_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs_data[curr_segment]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int fixed_out_col, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } + } +} + nvinfer1::Dims SplitPlugin::getOutputDimensions( int index, const nvinfer1::Dims* input_dims, int num_inputs) { PADDLE_ENFORCE_EQ(num_inputs, 1); @@ -31,48 +79,95 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions( int SplitPlugin::initialize() { PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS); - + // notice input dims is [C, H, W] + nvinfer1::Dims dims = this->getInputDims(0); + outer_rows_ = 1; + inner_cols_ = 1; + for (int i = 0; i < axis_; ++i) { + outer_rows_ *= dims.d[i]; + } + for (int i = axis_ + 1; i < dims.nbDims; ++i) { + inner_cols_ *= dims.d[i]; + } + same_shape_ = true; std::vector segment_offsets(1, 0); for (int i = 0; i < this->getNbOutputs(); ++i) { - segment_offsets.push_back(segment_offsets.back() + output_length_[i]); + if (output_length_[i] != output_length_[0]) { + same_shape_ = false; + } + segment_offsets.push_back(segment_offsets.back() + + output_length_[i] * inner_cols_); } - segment_offsets_ = segment_offsets; - nvinfer1::Dims dims = this->getInputDims(0); - nx_ = 1; - for (int i = dims.nbDims - 1; i > axis_; --i) { - nx_ *= dims.d[i]; + inner_cols_ *= dims.d[axis_]; + d_segment_offsets_ = segment_offsets; + segment_offsets_ = std::move(segment_offsets); + d_output_ptrs_.resize(this->getNbOutputs(), nullptr); + return 0; +} + +template +inline void Split(cudaStream_t stream, const bool same_shape, + const int outer_rows, const int inner_cols, + const std::vector& segment_offsets, + const int* d_segment_offsets, const T* input, T** outputs) { + const int kThreadsPerBlock = 1024; + const int kMaxBlocks = 65535; + int block_cols = kThreadsPerBlock; + if (inner_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((inner_cols + 31) >> 5) << 5; } - ny_ = dims.d[axis_]; - nz_ = 1; - for (int i = axis_ - 1; i >= 0; --i) { - nz_ *= dims.d[i]; + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int grid_cols = + std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks); + int grid_rows = + std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (same_shape) { + SplitKernel<<>>( + input, outer_rows, inner_cols, segment_offsets[1], outputs); + } else { + SplitKernel<<>>( + input, outer_rows, inner_cols, d_segment_offsets, + static_cast(segment_offsets.size()), outputs); } - return 0; } int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { - auto const& input_dims = this->getInputDims(0); - int input_size = 0; - float const* idata = reinterpret_cast(inputs[0]); - float** odatas = reinterpret_cast(outputs); - - // kernel impl here. - int inputBatchOffset = nx_ * ny_ * nz_; - for (size_t i = 0; i < this->getNbOutputs(); i++) { - for (size_t j = 0; j < batchSize; j++) { - cudaMemcpyAsync( - odatas[i] + - j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * - sizeof(float), - inputs[0] + - (inputBatchOffset * j + segment_offsets_[i] * nx_) * - sizeof(float), - (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float), - cudaMemcpyDeviceToDevice, stream); + float const* input_ptr = reinterpret_cast(inputs[0]); + if (axis_ == -1 && this->getNbOutputs() < 10) { + float** output_ptrs = reinterpret_cast(outputs); + int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) + ? sizeof(__half) + : sizeof(float); + for (int i = 0; i < this->getNbOutputs(); ++i) { + PADDLE_ENFORCE( + cudaMemcpyAsync( + output_ptrs[i], input_ptr + segment_offsets_[i], + (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size, + cudaMemcpyDeviceToDevice, stream) == cudaSuccess); + } + } else { + outer_rows_ *= batchSize; + const int* d_segment_offsets_ptr = + thrust::raw_pointer_cast(&d_segment_offsets_[0]); + float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); + PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs, + this->getNbOutputs() * sizeof(float*), + cudaMemcpyHostToDevice, + stream) == cudaSuccess); + if (this->getDataType() == nvinfer1::DataType::kFLOAT) { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, input_ptr, output_ptrs); + } else { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, (__half*)input_ptr, // NOLINT + (__half**)output_ptrs); // NOLINT } } - return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index b5b6e69992..6f028d3d72 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -25,7 +26,7 @@ namespace plugin { class SplitPlugin : public PluginTensorRT { public: SplitPlugin(int axis, std::vector const &output_lengths) - : axis_(axis), output_length_(output_lengths) {} + : axis_(axis), same_shape_(true), output_length_(output_lengths) {} SplitPlugin(void const *serial_data, size_t serial_length) { deserializeBase(serial_data, serial_length); @@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT { } int axis_; + int outer_rows_; + int inner_cols_; + bool same_shape_; std::vector output_length_; - int nx_, ny_, nz_; std::vector segment_offsets_; + thrust::device_vector d_segment_offsets_; + thrust::device_vector d_output_ptrs_; }; } // namespace plugin From 35620513023000ceb47ec0b57909dae4f0634355 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 21 Nov 2018 12:37:28 +0000 Subject: [PATCH 044/113] add gru refer code and remove redundant avx code test=develop --- paddle/fluid/operators/fused/fusion_gru_op.cc | 67 ++-- paddle/fluid/operators/math/jit_kernel.h | 8 +- paddle/fluid/operators/math/jit_kernel_exp.cc | 152 --------- paddle/fluid/operators/math/jit_kernel_impl.h | 30 +- .../fluid/operators/math/jit_kernel_refer.h | 40 +++ paddle/fluid/operators/math/jit_kernel_rnn.cc | 294 ++++-------------- 6 files changed, 163 insertions(+), 428 deletions(-) diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 7e34d1019c..25b7ae7c28 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -183,24 +183,27 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const auto& ker = math::jitkernel::KernelPool::Instance() \ - .template Get, \ - const std::string&, const std::string&>( \ - ctx.Attr("gate_activation"), \ - ctx.Attr("activation"), D); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const math::jitkernel::gru_attr_t attr( \ + D, ctx.Attr("gate_activation"), \ + ctx.Attr("activation")); \ + math::jitkernel::gru_t one_step; \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, \ + const math::jitkernel::gru_attr_t&>(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { @@ -237,7 +240,9 @@ class FusionGRUKernel : public framework::OpKernel { if (h0_data) { prev_hidden_data = h0_data + bid * D; } else { - ker->ComputeH1(xx_data, hidden_out_data); + one_step.gates = xx_data; + one_step.ht = hidden_out_data; + ker->ComputeH1(&one_step, &attr); prev_hidden_data = hidden_out_data; tstart = 1; move_step(); @@ -247,12 +252,15 @@ class FusionGRUKernel : public framework::OpKernel { blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast(1), prev_hidden_data, D, wh_data, D2, static_cast(1), xx_data, D3); - ker->ComputeHtPart1(xx_data, prev_hidden_data, hidden_out_data); + one_step.gates = xx_data; + one_step.ht_1 = prev_hidden_data; + one_step.ht = hidden_out_data; + ker->ComputeHtPart1(&one_step, &attr); // gemm rt * Ws blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast(1), hidden_out_data, D, wh_state_data, D, static_cast(1), xx_data + D2, D3); - ker->ComputeHtPart2(xx_data, prev_hidden_data, hidden_out_data); + ker->ComputeHtPart2(&one_step, &attr); // save prev prev_hidden_data = hidden_out_data; move_step(); @@ -314,7 +322,9 @@ class FusionGRUKernel : public framework::OpKernel { T* cur_out_data = batched_out_data; // W: {W_update, W_reset; W_state} for (int i = 0; i < max_bs; ++i) { - ker->ComputeH1(cur_in_data, cur_out_data); + one_step.gates = cur_in_data; + one_step.ht = cur_out_data; + ker->ComputeH1(&one_step, &attr); // add offset cur_in_data += D3; cur_out_data += D; @@ -339,8 +349,11 @@ class FusionGRUKernel : public framework::OpKernel { T* cur_out_data = batched_out_data; T* cur_prev_hidden_data = prev_hidden_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeHtPart1(cur_batched_data, cur_prev_hidden_data, - cur_out_data); + one_step.gates = cur_batched_data; + one_step.ht_1 = cur_prev_hidden_data; + one_step.ht = cur_out_data; + ker->ComputeHtPart1(&one_step, &attr); + cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; @@ -354,8 +367,10 @@ class FusionGRUKernel : public framework::OpKernel { cur_prev_hidden_data = prev_hidden_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeHtPart2(cur_batched_data, cur_prev_hidden_data, - cur_out_data); + one_step.gates = cur_batched_data; + one_step.ht_1 = cur_prev_hidden_data; + one_step.ht = cur_out_data; + ker->ComputeHtPart2(&one_step, &attr); cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index bb5ba5813a..b78b92b4f9 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -122,18 +122,18 @@ class VTanhKernel : public VActKernel {}; template class LSTMKernel : public Kernel { public: - void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *); // compute c1 and h1 without c0 or h0 void (*ComputeC1H1)(lstm_t *, const lstm_attr_t *); + void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *); }; template class GRUKernel : public Kernel { public: // compute h1 without h0 - virtual void ComputeH1(T *gates, T *ht) const = 0; - virtual void ComputeHtPart1(T *gates, const T *ht_1, T *ht) const = 0; - virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0; + void (*ComputeH1)(gru_t *, const gru_attr_t *); + void (*ComputeHtPart1)(gru_t *, const gru_attr_t *); + void (*ComputeHtPart2)(gru_t *, const gru_attr_t *); }; template diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index 1fe7d66c75..686f3dd983 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -25,10 +25,6 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/mklml.h" #endif -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { @@ -235,154 +231,6 @@ REGISTER_JITKERNEL(vexp, VExpKernel); REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel); REGISTER_JITKERNEL(vtanh, VTanhKernel); -namespace detail { - -#ifdef __AVX__ - -#define ALIGN32 __attribute__((aligned(32))) - -#define _PS256_CONST(Name, Val) \ - static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ - Val, Val, Val, Val} - -#define _PI256_CONST(Name, Val) \ - static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ - Val, Val, Val, Val} - -_PI256_CONST(0x7f, 0x7f); -_PS256_CONST(one, 1.f); -_PS256_CONST(0p5, 0.5f); -_PS256_CONST(exp_hi, 88.3762626647949f); -_PS256_CONST(exp_lo, -88.3762626647949f); -_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); -_PS256_CONST(cephes_exp_C1, 0.693359375); -_PS256_CONST(cephes_exp_C2, -2.12194440e-4); -_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); -_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); -_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); -_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); -_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); -_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); - -typedef union imm_xmm_union { - __m256i imm; - __m128i xmm[2]; -} imm_xmm_union; - -#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ - { \ - imm_xmm_union u ALIGN32; \ - u.imm = imm_; \ - xmm0_ = u.xmm[0]; \ - xmm1_ = u.xmm[1]; \ - } - -#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ - { \ - imm_xmm_union u ALIGN32; \ - u.xmm[0] = xmm0_; \ - u.xmm[1] = xmm1_; \ - imm_ = u.imm; \ - } - -#define AVX2_BITOP_USING_SSE2(fn) \ - static inline __m256i avx2_mm256_##fn(__m256i x, int y) { \ - /* use SSE2 to perform the bitop AVX2 */ \ - __m128i x1, x2; \ - __m256i ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - x1 = _mm_##fn(x1, y); \ - x2 = _mm_##fn(x2, y); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return ret; \ - } - -#define AVX2_INTOP_USING_SSE2(fn) \ - static inline __m256i avx2_mm256_add_epi32(__m256i x, __m256i y) { \ - /* use SSE2 to perform the AVX2 integer operation */ \ - __m128i x1, x2; \ - __m128i y1, y2; \ - __m256i ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - COPY_IMM_TO_XMM(y, y1, y2); \ - x1 = _mm_##fn(x1, y1); \ - x2 = _mm_##fn(x2, y2); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return ret; \ - } - -AVX2_BITOP_USING_SSE2(slli_epi32); -AVX2_INTOP_USING_SSE2(add_epi32); - -#define AVXEXP_BASE \ - __m256 tmp = _mm256_setzero_ps(), fx; \ - __m256 one = *reinterpret_cast(_ps256_one); \ - __m256i imm0; \ - x = _mm256_min_ps(x, *reinterpret_cast(_ps256_exp_hi)); \ - x = _mm256_max_ps(x, *reinterpret_cast(_ps256_exp_lo)); \ - /* express exp(x) as exp(g + n*log(2)) */ \ - fx = _mm256_mul_ps(x, \ - *reinterpret_cast(_ps256_cephes_LOG2EF)); \ - fx = _mm256_add_ps(fx, *reinterpret_cast(_ps256_0p5)); \ - tmp = _mm256_floor_ps(fx); \ - /* if greater, substract 1 */ \ - __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); \ - mask = _mm256_and_ps(mask, one); \ - fx = _mm256_sub_ps(tmp, mask); \ - tmp = _mm256_mul_ps(fx, \ - *reinterpret_cast(_ps256_cephes_exp_C1)); \ - __m256 z = _mm256_mul_ps( \ - fx, *reinterpret_cast(_ps256_cephes_exp_C2)); \ - x = _mm256_sub_ps(x, tmp); \ - x = _mm256_sub_ps(x, z); \ - z = _mm256_mul_ps(x, x); \ - __m256 y = *reinterpret_cast(_ps256_cephes_exp_p0); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p1)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p2)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p3)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p4)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p5)); \ - y = _mm256_mul_ps(y, z); \ - y = _mm256_add_ps(y, x); \ - y = _mm256_add_ps(y, one); \ - /* build 2^n */ \ - imm0 = _mm256_cvttps_epi32(fx) - -__m256 ExpAVX(__m256 x) { - AVXEXP_BASE; - // two AVX2 instructions using SSE2 - imm0 = avx2_mm256_add_epi32(imm0, - *reinterpret_cast(_pi256_0x7f)); - imm0 = avx2_mm256_slli_epi32(imm0, 23); - __m256 pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} -#endif - -#ifdef __AVX2__ -__m256 ExpAVX2(__m256 x) { - AVXEXP_BASE; - // two AVX2 instructions - imm0 = _mm256_add_epi32(imm0, *reinterpret_cast(_pi256_0x7f)); - imm0 = _mm256_slli_epi32(imm0, 23); - __m256 pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} -#endif - -} // namespace detail } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h index 2e734ca940..ba5f20e533 100644 --- a/paddle/fluid/operators/math/jit_kernel_impl.h +++ b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -38,20 +38,34 @@ typedef struct { void* checked{nullptr}; } lstm_t; -typedef struct lstm_attr_s { - bool use_peephole; +typedef struct { + void* gates; // gates: {W_update, W_reset; W_state} + const void* ht_1; + void* ht; +} gru_t; + +struct rnn_attr_s { int d; - std::string act_gate, act_cand, act_cell; + std::string act_gate, act_cand; + rnn_attr_s() = default; + rnn_attr_s(int _d, const std::string& _act_gate, const std::string& _act_cand) + : d(_d), act_gate(_act_gate), act_cand(_act_cand) {} +}; + +struct lstm_attr_s : public rnn_attr_s { + bool use_peephole; + std::string act_cell; lstm_attr_s() = default; lstm_attr_s(int _d, const std::string& _act_gate, const std::string& _act_cand, const std::string& _act_cell, bool _use_peephole = false) - : use_peephole(_use_peephole), - d(_d), - act_gate(_act_gate), - act_cand(_act_cand), + : rnn_attr_s(_d, _act_gate, _act_cand), + use_peephole(_use_peephole), act_cell(_act_cell) {} -} lstm_attr_t; +}; + +typedef struct rnn_attr_s gru_attr_t; +typedef struct lstm_attr_s lstm_attr_t; } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index 097bb85956..2e1a7f22db 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -185,6 +185,46 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { VMul(gates + d2, gates + d3, ht, d); } +// compute h1 without h0 +template +void GRUH1(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + int d2 = d * 2; + act_gate(gates, gates, d); + act_cand(gates + d2, gates + d2, d); + VMul(gates, gates + d2, ht, d); +} + +template +void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { + // W: {W_update, W_reset; W_state} + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + act_gate(gates, gates, attr->d * 2); + VMul(ht_1, gates + attr->d, ht, attr->d); +} + +template +void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + T* y = gates + d * 2; + act_cand(y, y, d); + // out = zt*ht~ + (1-zt)*ht_1 + for (int i = 0; i < d; ++i) { + ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; + } +} + } // namespace refer } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index 6b7463aa52..dbfd212e6e 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -23,140 +23,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/jit_code.h" #endif -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace detail { -#ifdef __AVX__ -__m256 ExpAVX(__m256 x); -#endif - -#ifdef __AVX2__ -__m256 ExpAVX2(__m256 x); -#endif - -} // namespace detail - -namespace jit = platform::jit; - -#ifdef __AVX__ -typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type; - -class AVXAct { - public: - virtual ~AVXAct() = default; - virtual __m256 Compute(__m256 x) const = 0; -}; - -template -class AVXActImpl : public AVXAct { - public: - __m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); } -}; - -#define AVX_SIGMOID(isa, expisa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - __m256 ones = _mm256_set1_ps(1.0f); \ - x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); \ - x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); \ - x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x); \ - x = expisa(x); \ - x = _mm256_add_ps(ones, x); \ - return _mm256_div_ps(ones, x); \ - } - -#define AVX_TANH(isa, expisa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - __m256 ones = _mm256_set1_ps(1.0f); \ - x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x); \ - x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT)); \ - x = expisa(x); \ - x = _mm256_add_ps(ones, x); \ - x = _mm256_div_ps(_mm256_set1_ps(2.0f), x); \ - return _mm256_sub_ps(x, ones); \ - } - -#define AVX_RELU(isa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - return _mm256_max_ps(x, _mm256_setzero_ps()); \ - } - -#define AVX_IDENTITY(isa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - return x; \ - } - -#define FOR_EACH_AVX_ISA(macro_) \ - macro_(jit::avx); \ - macro_(jit::avx2); \ - macro_(jit::avx512f) - -FOR_EACH_AVX_ISA(AVX_RELU); -FOR_EACH_AVX_ISA(AVX_IDENTITY); - -AVX_SIGMOID(jit::avx, detail::ExpAVX); -AVX_TANH(jit::avx, detail::ExpAVX); - -#ifdef __AVX2__ -AVX_SIGMOID(jit::avx2, detail::ExpAVX2); -AVX_SIGMOID(jit::avx512f, detail::ExpAVX2); -AVX_TANH(jit::avx2, detail::ExpAVX2); -AVX_TANH(jit::avx512f, detail::ExpAVX2); -#endif - -#undef FOR_EACH_AVX_ISA -#undef AVX_IDENTITY -#undef AVX_RELU -#undef AVX_TANH -#undef AVX_SIGMOID - -#endif - -template -static std::shared_ptr> GetActKernel( - const std::string& type, int n) { - if (type == "sigmoid") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "relu") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "tanh") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "identity" || type == "") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } - PADDLE_THROW("Not support type: %s", type); - return nullptr; -} - -#ifdef __AVX__ -template -static std::unique_ptr GetAVXAct(const std::string& type) { - if (type == "sigmoid") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "relu") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "tanh") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "identity" || type == "") { - return std::unique_ptr(new AVXActImpl()); - } - PADDLE_THROW("Not support type: %s", type); - return nullptr; -} -#endif /* LSTM JitKernel */ template @@ -290,125 +160,73 @@ REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DEFINE_NAME_LSTM, JITKERNEL_DECLARE_LSTM, JITKERNEL_FIND_KEY_LSTM, JITKERNEL_LSTM_IMPL); +#undef JITKERNEL_LSTM_IMPL +#undef JITKERNEL_FIND_KEY_LSTM +#undef JITKERNEL_DECLARE_LSTM +#undef JITKERNEL_DEFINE_NAME_LSTM + /* GRU JitKernel */ -template +template class GRUKernelImpl : public GRUKernel { public: - explicit GRUKernelImpl(const std::string& act_gate, - const std::string& act_state, int d) - : GRUKernel() { - d_ = d; - d2_ = d * 2; - act_gate_d2_ = GetActKernel(act_gate, d2_); - act_gate_d_ = GetActKernel(act_gate, d); - act_state_d_ = GetActKernel(act_state, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - } - - void ComputeH1(T* gates, T* ht) const override { - act_gate_d_->Compute(gates, gates, d_); - act_state_d_->Compute(gates + d2_, gates + d2_, d_); - vmul_d_->Compute(gates, gates + d2_, ht, d_); - } - - void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { - // W: {W_update, W_reset; W_state} - act_gate_d2_->Compute(gates, gates, d2_); - vmul_d_->Compute(ht_1, gates + d_, ht, d_); + static inline std::string name(const gru_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } - - void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override { - T* y = gates + d2_; - act_state_d_->Compute(y, y, d_); - // out = zt*ht~ + (1-zt)*ht_1 - for (int i = 0; i < d_; ++i) { - ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; - } + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel() { + this->ComputeH1 = refer::GRUH1; + this->ComputeHtPart1 = refer::GRUHtPart1; + this->ComputeHtPart2 = refer::GRUHtPart2; } - - private: - int d_, d2_; - std::shared_ptr> act_gate_d2_, act_gate_d_, act_state_d_; - std::shared_ptr> vmul_d_; -#ifdef __AVX__ - std::unique_ptr avx_act_gate_, avx_act_state_; -#endif }; -#define INTRI8_FLOAT(isa) \ - template <> \ - GRUKernelImpl::GRUKernelImpl( \ - const std::string& act_gate, const std::string& act_state, int d) \ - : GRUKernel() { \ - avx_act_gate_ = GetAVXAct(act_gate); \ - avx_act_state_ = GetAVXAct(act_state); \ - } \ - template <> \ - void GRUKernelImpl::ComputeH1(float* gates, float* ht) \ - const { \ - __m256 u, s; \ - /* W: {W_update, W_reset; W_state} */ \ - u = _mm256_loadu_ps(gates); \ - s = _mm256_loadu_ps(gates + 16); \ - s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \ - _mm256_storeu_ps(ht, s); \ - } \ - template <> \ - void GRUKernelImpl::ComputeHtPart1( \ - float* gates, const float* ht_1, float* ht) const { \ - /* not exactly equal the any implementation */ \ - __m256 r, ht0; \ - r = _mm256_loadu_ps(gates + 8); \ - ht0 = _mm256_loadu_ps(ht_1); \ - r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0); \ - _mm256_storeu_ps(ht, r); \ - } \ - template <> \ - void GRUKernelImpl::ComputeHtPart2( \ - float* gates, const float* ht_1, float* ht) const { \ - /* not exactly equal the any implementation */ \ - __m256 u, s, ht0; \ - u = _mm256_loadu_ps(gates); \ - s = _mm256_loadu_ps(gates + 16); \ - ht0 = _mm256_loadu_ps(ht_1); \ - u = avx_act_gate_->Compute(u); \ - s = _mm256_mul_ps(u, avx_act_state_->Compute(s)); \ - u = _mm256_sub_ps(_mm256_set1_ps(1.f), u); \ - u = _mm256_mul_ps(u, ht0); \ - u = _mm256_add_ps(s, u); \ - _mm256_storeu_ps(ht, u); \ - } - -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); -#endif - -#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \ - template <> \ - std::shared_ptr> KernelPool::Get< \ - GRUKernel, const std::string&, const std::string&, int>( \ - const std::string& act_gate, const std::string& act_state, int d) - -#define JITKERNEL_KEY_GRU(ker_key, dtype_key) \ - #ker_key #dtype_key + std::to_string(d) + act_gate + act_state +#define JITKERNEL_DEFINE_NAME_GRU(ker_key, ker_class) \ + template <> \ + std::string ker_class##Impl::name(const gru_attr_t& attr) { \ + std::string key(#ker_key "f"); \ + key += (attr.act_gate + attr.act_cand); \ + if (useJIT(attr.d)) { \ + /* only jit code need record d*/ \ + return key + "jit" + std::to_string(attr.d); \ + } else if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } \ + template <> \ + std::string ker_class##Impl::name(const gru_attr_t& attr) { \ + std::string key(#ker_key "d"); \ + /* jit code do not support double yet*/ \ + if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } + +#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const gru_attr_t&>( \ + const gru_attr_t& attr) + +#define JITKERNEL_FIND_KEY_GRU(ker_class, ker_dtype) \ + std::string key = ker_class##Impl::name(attr) -#define JITKERNEL_NEW_GRU_IMPL(ker, dtype, isa, k) \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>(act_gate, act_state, d)); +#define JITKERNEL_GRU_IMPL(ker, dtype) \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); -REGISTER_JITKERNEL_ARGS_DEPRECATED(gru, GRUKernel, JITKERNEL_DECLARE_GRU, - JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL); +REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DEFINE_NAME_GRU, + JITKERNEL_DECLARE_GRU, JITKERNEL_FIND_KEY_GRU, + JITKERNEL_GRU_IMPL); -#undef INTRI8_FLOAT -#undef JITKERNEL_NEW_GRU_IMPL -#undef JITKERNEL_KEY_GRU +#undef JITKERNEL_GRU_IMPL +#undef JITKERNEL_FIND_KEY_GRU #undef JITKERNEL_DECLARE_GRU +#undef JITKERNEL_DEFINE_NAME_GRU } // namespace jitkernel } // namespace math } // namespace operators From 6193dc76368f5f888d8270f938ec81b78e06ffdd Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 21 Nov 2018 23:17:26 +0800 Subject: [PATCH 045/113] test=develop --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5325e3034c..bc2ac2cd93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,6 +139,10 @@ if (WIN32) "Disable MKL when compiling for Windows" FORCE) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) + set(WITH_C_API OFF CACHE STRING + "Disable C_API when compiling for Windows" FORCE) + set(WITH_FLUID_ONLY ON CACHE STRING + "Enable FLUID_ONLY when compiling for Windows" FORCE) endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING From 6eba5bd276a8d79d5611ec42db0c47273fb4950c Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 21 Nov 2018 15:32:25 +0000 Subject: [PATCH 046/113] Fix direct copy and refine split ut test=develop --- .../tensorrt/convert/test_split_op.cc | 55 ++++++++++++++----- .../tensorrt/plugin/split_op_plugin.cu | 7 ++- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index f81d011552..23909378dd 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -20,30 +20,59 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(split_op, test) { +template +void TensorRTSplitTest(const std::vector &in_shape, + const std::vector §ions) { std::unordered_set parameters({""}); framework::Scope scope; - TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2)); - validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2)); - validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2)); + TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000); + + auto make_dim = [](const std::vector &shape) { + nvinfer1::DimsCHW dim; + dim.c() = shape[0]; + dim.h() = shape[1]; + dim.w() = shape[2]; + return dim; + }; + validator.DeclInputVar("split_input", make_dim(in_shape)); + std::vector output_vars; + for (size_t i = 0; i < sections.size(); ++i) { + auto out_shape = in_shape; + out_shape[Axis - 1] = sections[i]; + std::string output_name = "split_out" + std::to_string(i); + validator.DeclOutputVar(output_name, make_dim(out_shape)); + output_vars.push_back(output_name); + } // Prepare Op description framework::OpDesc desc; desc.SetType("split"); desc.SetInput("X", {"split_input"}); - desc.SetOutput("Out", {"split_out1", "split_out2"}); + desc.SetOutput("Out", output_vars); - int num = 0; - int axis = 1; - std::vector output_lengths = {2, 1}; - desc.SetAttr("axis", axis); - desc.SetAttr("num", num); - desc.SetAttr("sections", output_lengths); + desc.SetAttr("axis", Axis); + desc.SetAttr("num", 0); + desc.SetAttr("sections", sections); validator.SetOp(*desc.Proto()); - validator.Execute(1); + validator.Execute(BatchSize); +} + +TEST(split_op, test_same_shape_batch1) { + TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2}); +} + +TEST(split_op, test_different_shape_batch1) { + TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1}); +} + +TEST(split_op, test_same_shape_batch10) { + TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2}); +} + +TEST(split_op, test_different_shape_batch10) { + TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1}); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 1ec0753e9f..de61ace59e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -138,11 +138,12 @@ inline void Split(cudaStream_t stream, const bool same_shape, int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { float const* input_ptr = reinterpret_cast(inputs[0]); - if (axis_ == -1 && this->getNbOutputs() < 10) { + if (((batchSize == 1 && axis_ == 0) || axis_ == -1) && + this->getNbOutputs() < 10) { float** output_ptrs = reinterpret_cast(outputs); int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) - ? sizeof(__half) - : sizeof(float); + ? sizeof(float) + : sizeof(__half); for (int i = 0; i < this->getNbOutputs(); ++i) { PADDLE_ENFORCE( cudaMemcpyAsync( From 3912545ffec3ea5a850420f0a804afadc9f0352a Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 22 Nov 2018 04:30:19 +0000 Subject: [PATCH 047/113] add dlpack support test=develop --- CMakeLists.txt | 1 + cmake/external/dlpack.cmake | 31 +++++ paddle/fluid/framework/CMakeLists.txt | 3 + paddle/fluid/framework/dlpack_tensor.cc | 127 +++++++++++++++++++ paddle/fluid/framework/dlpack_tensor.h | 45 +++++++ paddle/fluid/framework/dlpack_tensor_test.cc | 113 +++++++++++++++++ 6 files changed, 320 insertions(+) create mode 100644 cmake/external/dlpack.cmake create mode 100644 paddle/fluid/framework/dlpack_tensor.cc create mode 100644 paddle/fluid/framework/dlpack_tensor.h create mode 100644 paddle/fluid/framework/dlpack_tensor_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index c62cc9bfd7..b6ae241272 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,6 +190,7 @@ include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) include(external/xxhash) # download xxhash +include(external/dlpack) if (NOT WIN32) # there is no official support of snappystream, warpctc, nccl, cupti in windows diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake new file mode 100644 index 0000000000..94d8fcc668 --- /dev/null +++ b/cmake/external/dlpack.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) + +set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack) +set(DLPACK_INCLUDE_DIR ${DLPACK_SOURCE_DIR}/src/extern_dlpack/include) + +include_directories(${DLPACK_INCLUDE_DIR}) + +ExternalProject_Add( + extern_dlpack + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/dmlc/dlpack.git" + GIT_TAG "v0.2" + PREFIX ${DLPACK_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/dlpack_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(dlpack STATIC ${dummyfile}) +else() + add_library(dlpack INTERFACE) +endif() + +add_dependencies(dlpack extern_dlpack) + +LIST(APPEND externl_project_dependencies dlpack) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cb9057672c..d7d7834b49 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -205,3 +205,6 @@ cc_test(tuple_test SRCS tuple_test.cc ) if (NOT WIN32) cc_test(rw_lock_test SRCS rw_lock_test.cc) endif (NOT WIN32) + +cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) +cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc new file mode 100644 index 0000000000..04e3f78afe --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" + +namespace paddle { +namespace framework { + +namespace internal { +template +static ::DLDataType GetDLDataTypeCode() { + ::DLDataType dtype; + if (std::is_same::value || + std::is_floating_point::value) { + dtype.code = kDLFloat; + } else if (std::is_unsigned::value) { + dtype.code = kDLUInt; + } else if (std::is_integral::value) { + dtype.code = kDLInt; + } else { + PADDLE_THROW("Unsupported data type %s", typeid(T).name()); + } + dtype.bits = 8 * sizeof(T); + dtype.lanes = 1; + return dtype; +} + +static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { +#define REG_DL_DATA_TYPE(type) \ + { std::type_index(typeid(type)), GetDLDataTypeCode() } + static const std::unordered_map + type_to_dtype_map({ + REG_DL_DATA_TYPE(platform::float16), // NOLINT + REG_DL_DATA_TYPE(float), // NOLINT + REG_DL_DATA_TYPE(double), // NOLINT + REG_DL_DATA_TYPE(int), // NOLINT + REG_DL_DATA_TYPE(int64_t), // NOLINT + REG_DL_DATA_TYPE(bool), // NOLINT + REG_DL_DATA_TYPE(size_t), // NOLINT + REG_DL_DATA_TYPE(int16_t), // NOLINT + REG_DL_DATA_TYPE(uint8_t), // NOLINT + REG_DL_DATA_TYPE(int8_t) // NOLINT + }); + static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); + auto it = type_to_dtype_map.find(type); + PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s", + type.name()); + return it->second; +#undef REG_DL_DATA_TYPE +} + +struct DLContextVisitor : public boost::static_visitor<::DLContext> { + inline ::DLContext operator()(const platform::CPUPlace &place) const { + DLContext ctx; + ctx.device_type = kDLCPU; + ctx.device_id = 0; + return ctx; + } + + inline ::DLContext operator()(const platform::CUDAPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLGPU; + ctx.device_id = place.device; + return ctx; +#else + PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version"); +#endif + } + + inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLCPUPinned; + ctx.device_id = 0; + return ctx; +#else + PADDLE_THROW( + "platform::CUDAPinnedPlace is not supported in CPU only version"); +#endif + } +}; +} // namespace internal + +DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { + // init data, data buffer + t_.data = const_cast(tensor.data()); + + // init ctx, DLContext type with device_type and device_id + auto place = tensor.place(); + t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place); + + // init dtype + t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); + t_.dtype.lanes = lanes; + + // init ndim, tensor rank + auto &dims = tensor.dims(); + using DimType = decltype(t_.ndim); // int + t_.ndim = static_cast(dims.size()); + + // init shape, tensor dims + t_.shape = shape_; + for (DimType i = 0; i < t_.ndim; ++i) { + t_.shape[i] = dims[i]; + } + + // init strides, nullptr means the tensor is compact + t_.strides = nullptr; + + // init byte_offset + t_.byte_offset = 0; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h new file mode 100644 index 0000000000..0c52bce1ef --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { + +class DLPackTensor { + public: + using LaneType = decltype(::DLTensor::dtype.lanes); // uint16_t + using ShapeType = + std::remove_reference::type; // int64_t + + // lanes is only used in CPU to enable vectorization + explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1); + + inline operator const ::DLTensor&() const { return t_; } + + inline operator ::DLTensor&() { return t_; } + + private: + ::DLTensor t_; + + // The shape in DLTensor is defined as int64_t* + // Add this member to make TVMTensor init without heap allocation + ShapeType shape_[9]; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc new file mode 100644 index 0000000000..938b056350 --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +namespace { // NOLINT +template +constexpr uint8_t GetDLDataTypeCode() { + return std::is_same::value || + std::is_floating_point::value + ? static_cast(kDLFloat) + : (std::is_unsigned::value + ? static_cast(kDLUInt) + : (std::is_integral::value ? static_cast(kDLInt) + : static_cast(-1))); +} +} // NOLINT + +template +void TestMain(const platform::Place &place, uint16_t lanes) { + DDim dims{4, 5, 6, 7}; + Tensor tensor; + tensor.Resize(dims); + void *p = tensor.mutable_data(place); + + DLPackTensor dlpack_tensor(tensor, lanes); + ::DLTensor &dl_tensor = dlpack_tensor; + + CHECK_EQ(p, dl_tensor.data); + if (platform::is_cpu_place(place)) { + CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else if (platform::is_gpu_place(place)) { + CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type); + CHECK_EQ(boost::get(place).device, + dl_tensor.ctx.device_id); + } else if (platform::is_cuda_pinned_place(place)) { + CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else { + CHECK_EQ(false, true); + } + + CHECK_EQ(dims.size(), dl_tensor.ndim); + for (auto i = 0; i < dims.size(); ++i) { + CHECK_EQ(dims[i], dl_tensor.shape[i]); + } + + CHECK_EQ(dl_tensor.strides == nullptr, true); + CHECK_EQ(static_cast(0), dl_tensor.byte_offset); + + CHECK_EQ(lanes, dl_tensor.dtype.lanes); + CHECK_EQ(sizeof(T) * 8, dl_tensor.dtype.bits); + + CHECK_EQ(GetDLDataTypeCode(), dl_tensor.dtype.code); +} + +template +void TestMainLoop() { +#ifdef PADDLE_WITH_CUDA + std::vector places{platform::CPUPlace(), + platform::CUDAPlace(0), + platform::CUDAPinnedPlace()}; + if (platform::GetCUDADeviceCount() > 1) { + places.emplace_back(platform::CUDAPlace(1)); + } +#else + std::vector places{platform::CPUPlace()}; +#endif + std::vector lanes{1, 2}; + for (auto &p : places) { + for (auto &l : lanes) { + TestMain(p, l); + } + } +} + +#define PADDLE_DLPACK_TEST(type) \ + TEST(dlpack, test_##type) { TestMainLoop(); } + +using float16 = platform::float16; +PADDLE_DLPACK_TEST(float16); +PADDLE_DLPACK_TEST(float); +PADDLE_DLPACK_TEST(double); +PADDLE_DLPACK_TEST(int); +PADDLE_DLPACK_TEST(int64_t); +PADDLE_DLPACK_TEST(bool); +PADDLE_DLPACK_TEST(size_t); +PADDLE_DLPACK_TEST(int16_t); +PADDLE_DLPACK_TEST(uint8_t); +PADDLE_DLPACK_TEST(int8_t); + +#undef PADDLE_DLPACK_TEST + +} // namespace framework +} // namespace paddle From 982e48922020e8d5f3ddcfc682068fcbdc5b7fe2 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 22 Nov 2018 06:26:04 +0000 Subject: [PATCH 048/113] test=develop --- python/paddle/fluid/layers/nn.py | 5 +++-- python/paddle/fluid/tests/unittests/test_layers.py | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 99acd7e308..32d411b830 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2139,8 +2139,9 @@ def pool2d(input, input tensor is NCHW, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. - pool_size (int): The side length of pooling windows. All pooling - windows are squares with pool_size on a side. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. pool_type: ${pooling_type_comment} pool_stride (int): stride of the pooling layer. pool_padding (int): padding size. diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index a8fa5436c4..c4310fe006 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -202,6 +202,12 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_unpad(x=x, length=length)) print(str(program)) + def test_pool2d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') + self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3])) + def test_lstm_unit(self): program = Program() with program_guard(program): From 1adda8e06c075d55edcc6aa50804eab62b903f72 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Thu, 22 Nov 2018 06:53:16 +0000 Subject: [PATCH 049/113] Add more unit tests for split plugin test=develop --- .../inference/tensorrt/convert/split_op.cc | 13 ++--- .../tensorrt/convert/test_split_op.cc | 47 ++++++++++++++++--- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 871354267e..ae5b1b9806 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -19,9 +19,6 @@ namespace paddle { namespace inference { namespace tensorrt { -/* - * SplitOp. - */ class SplitOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, @@ -40,15 +37,11 @@ class SplitOpConverter : public OpConverter { int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = boost::get>(op_desc.GetAttr("sections")); - // PADDLE_ENFORCE(axis != 0); - if (axis < 0) { - axis += input_dims.nbDims; - } else { - axis -= 1; - } + // split on batch is not supported in TensorRT + PADDLE_ENFORCE(axis != 0); + axis += (axis < 0) ? input_dims.nbDims : -1; PADDLE_ENFORCE(output_lengths.size() == output_num); - // plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index 23909378dd..5aacc5c600 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -59,21 +59,54 @@ void TensorRTSplitTest(const std::vector &in_shape, validator.Execute(BatchSize); } -TEST(split_op, test_same_shape_batch1) { +// batch = 0, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch1) { TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2}); } - -TEST(split_op, test_different_shape_batch1) { +// batch = 0, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch1) { TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1}); } - -TEST(split_op, test_same_shape_batch10) { +// batch = 10, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch10) { TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2}); } - -TEST(split_op, test_different_shape_batch10) { +// batch = 10, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch10) { TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1}); } +// batch = 0, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2}); +} +// batch = 0, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1}); +} +// batch = 10, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2}); +} +// batch = 10, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1}); +} +// batch = 0, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2}); +} +// batch = 0, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1}); +} +// batch = 10, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2}); +} +// batch = 10, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1}); +} } // namespace tensorrt } // namespace inference From e3b61cf52b88b1350de8776afcfd8e5ae348e164 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 22 Nov 2018 08:24:01 +0000 Subject: [PATCH 050/113] init gru jitcode and fix lstm jitcode test=develop --- paddle/fluid/operators/math/jit_code.cc | 36 ++++- paddle/fluid/operators/math/jit_code.h | 140 ++++++++++++++---- paddle/fluid/operators/math/jit_kernel_rnn.cc | 36 ++++- 3 files changed, 170 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index ccc9206f5c..03b67238fe 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -214,6 +214,9 @@ void VActJitCode::generate() { bool LSTMJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } void LSTMJitCode::generate() { + if (use_peephole_) { + preCode(); + } reg64_t reg_ptr_gates = rax; reg64_t reg_ptr_ct_1 = r9; reg64_t reg_ptr_ct = r10; @@ -224,18 +227,19 @@ void LSTMJitCode::generate() { mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); int offset = 0; + int d = num_ * sizeof(float); for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { /* C_t = C_t-1 * fgated + cand_gated * igated*/ // c vmovups(ymm_src, ptr[reg_ptr_gates + offset]); act(ymm_c, ymm_src, act_cand_); // i - vmovups(ymm_src, ptr[reg_ptr_gates + offset + num_]); + vmovups(ymm_src, ptr[reg_ptr_gates + offset + d]); act(ymm_i, ymm_src, act_gate_); vmulps(ymm_c, ymm_c, ymm_i); if (!compute_c1h1_) { // f - vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * num_]); + vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * d]); act(ymm_f, ymm_src, act_gate_); vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]); vmulps(ymm_f, ymm_f, ymm_i); @@ -245,20 +249,36 @@ void LSTMJitCode::generate() { ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f; ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c; ymm_t ymm_tmp = ymm_i; + vmovups(ptr[reg_ptr_ct + offset], ymm_ct); // save ct act(ymm_tmp, ymm_ct, act_cell_); - vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * num_]); + vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * d]); act(ymm_o, ymm_src, act_gate_); vmulps(ymm_o, ymm_tmp, ymm_o); - // save ct and ht - vmovups(ptr[reg_ptr_ct + offset], ymm_ct); - vmovups(ptr[reg_ptr_ht + offset], ymm_o); - + vmovups(ptr[reg_ptr_ht + offset], ymm_o); // save ht offset += sizeof(float) * YMM_FLOAT_BLOCK; } - ret(); + if (use_peephole_) { + postCode(); + } else { + ret(); + } } +bool GRUJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } + +void GRUJitCode::generate() { + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ct_1 = r9; + reg64_t reg_ptr_ct = r10; + reg64_t reg_ptr_ht = r11; + mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); + mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); + mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + + ret(); +} } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index bf28d444b7..403cea3991 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -302,6 +302,34 @@ class VActJitCode : public JitCode { pop(reg_ptr_global); } + template + void act(JMM& dst, JMM& src, operand_type type) { // NOLINT + // use 15 + JMM zero = JMM(15); + if (type_ == operand_type::relu) { + vxorps(zero, zero, zero); + } + switch (type) { + case operand_type::relu: + relu_jmm(dst, src, zero); + break; + case operand_type::exp: + exp_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::sigmoid: + sigmoid_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::tanh: + tanh_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::identity: + break; + default: + // throw error + break; + } + } + protected: int num_; operand_type type_; @@ -386,44 +414,94 @@ class LSTMJitCode : public VActJitCode { operand_type act_cand_; operand_type act_cell_; reg64_t param1{abi_param1}; - xmm_t xmm_src = xmm_t(0); xmm_t xmm_c = xmm_t(1); - xmm_t xmm_i = xmm_t(2); - xmm_t xmm_f = xmm_t(3); + xmm_t xmm_i = xmm_t(6); + xmm_t xmm_f = xmm_t(7); ymm_t ymm_src = ymm_t(0); - ymm_t ymm_c = ymm_t(1); - ymm_t ymm_i = ymm_t(2); - ymm_t ymm_f = ymm_t(3); + ymm_t ymm_c = ymm_t(1); // 2~5 for act + ymm_t ymm_i = ymm_t(6); + ymm_t ymm_f = ymm_t(7); +}; - template - void act(JMM& dst, JMM& src, operand_type type) { // NOLINT - // use 15 - JMM zero = JMM(15); - if (type_ == operand_type::relu) { - vxorps(zero, zero, zero); - } - switch (type) { - case operand_type::relu: - relu_jmm(dst, src, zero); - break; - case operand_type::exp: - exp_jmm(dst, src, 2, 3, 4, 5); - break; - case operand_type::sigmoid: - sigmoid_jmm(dst, src, 2, 3, 4, 5); - break; - case operand_type::tanh: - tanh_jmm(dst, src, 2, 3, 4, 5); - break; - case operand_type::identity: - break; - default: - // throw error - break; +class GRUJitCode : public VActJitCode { + public: + const char* name() const override { + std::string base = "GRUJitCode"; + if (id_ == 0) { + base += "_H1"; + } else if (id_ == 1) { + base += "_HtPart1"; + } else if (id_ == 2) { + base += "_HtPart2"; } + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + return base.c_str(); } + + explicit GRUJitCode(int id, const gru_attr_t& attr, + size_t code_size = 256 * 1024, void* code_ptr = nullptr) + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + id_(id) { + auto typeExchange = [](const std::string& type) -> gen::operand_type { + if (type == "sigmoid") { + return operand_type::sigmoid; + } else if (type == "relu") { + return operand_type::relu; + } else if (type == "tanh") { + return operand_type::tanh; + } else if (type == "identity" || type == "") { + return operand_type::identity; + } // else throw error + return operand_type::identity; + }; + num_ = attr.d; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + } + static bool init(int d); + void generate() override; + + protected: + int id_; + int num_; + operand_type act_gate_; + operand_type act_cand_; + reg64_t param1{abi_param1}; + + xmm_t xmm_src = xmm_t(0); + xmm_t xmm_c = xmm_t(1); + xmm_t xmm_i = xmm_t(6); + xmm_t xmm_f = xmm_t(7); + + ymm_t ymm_src = ymm_t(0); + ymm_t ymm_c = ymm_t(1); + ymm_t ymm_i = ymm_t(6); + ymm_t ymm_f = ymm_t(7); }; #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index dbfd212e6e..e571d8adf4 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -40,7 +40,7 @@ class LSTMKernelImpl : public LSTMKernel { explicit LSTMKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { #ifdef PADDLE_WITH_XBYAK if (useJIT(attr.d)) { - size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); this->ComputeCtHt = jitcode0_->getCode(); @@ -66,7 +66,7 @@ class LSTMKernelImpl : public LSTMKernel { #ifdef PADDLE_WITH_XBYAK template <> bool LSTMKernelImpl::useJIT(int d) { - return false; // not ready yet gen::LSTMJitCode::init(d); + return gen::LSTMJitCode::init(d); } #endif @@ -82,7 +82,7 @@ class PeepholeKernelImpl : public LSTMKernel { explicit PeepholeKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { #ifdef PADDLE_WITH_XBYAK if (useJIT(attr.d)) { - size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 4 * 8; jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); this->ComputeCtHt = jitcode0_->getCode(); @@ -175,12 +175,42 @@ class GRUKernelImpl : public GRUKernel { static inline bool useJIT(int d) { return false; } static inline bool useMKL(int d) { return false; } explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + jitcode0_.reset(new gen::GRUJitCode(0, attr, sz > 4096 ? sz : 4096)); + this->ComputeH1 = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::GRUJitCode(1, attr, sz > 4096 ? sz : 4096)); + this->ComputeHtPart1 = + jitcode1_->getCode(); + + jitcode2_.reset(new gen::GRUJitCode(2, attr, sz > 4096 ? sz : 4096)); + this->ComputeHtPart2 = + jitcode1_->getCode(); + return; + } +#endif this->ComputeH1 = refer::GRUH1; this->ComputeHtPart1 = refer::GRUHtPart1; this->ComputeHtPart2 = refer::GRUHtPart2; } +#ifdef PADDLE_WITH_XBYAK + + private: + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}, + jitcode2_{nullptr}; +#endif }; +#ifdef PADDLE_WITH_XBYAK +template <> +bool GRUKernelImpl::useJIT(int d) { + return false; // jitcode not ready yet +} +#endif + #define JITKERNEL_DEFINE_NAME_GRU(ker_key, ker_class) \ template <> \ std::string ker_class##Impl::name(const gru_attr_t& attr) { \ From 510601b2793047858763032b7816af07ab2b2bc7 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 22 Nov 2018 09:01:08 +0000 Subject: [PATCH 051/113] test=develop --- python/paddle/fluid/layers/nn.py | 10 +++++++--- python/paddle/fluid/tests/unittests/test_layers.py | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 32d411b830..27f83a60bd 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2139,12 +2139,16 @@ def pool2d(input, input tensor is NCHW, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. - pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple, + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be a square of an int. pool_type: ${pooling_type_comment} - pool_stride (int): stride of the pooling layer. - pool_padding (int): padding size. + pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain two integers, (pool_stride_Height, pool_stride_Width). + Otherwise, the pool stride size will be a square of an int. + pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple, + it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width). + Otherwise, the pool padding size will be a square of an int. global_pooling (bool): ${global_pooling_comment} use_cudnn (bool): ${use_cudnn_comment} ceil_mode (bool): ${ceil_mode_comment} diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index c4310fe006..559c9cda48 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -206,7 +206,12 @@ class TestBook(unittest.TestCase): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') - self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3])) + self.assertIsNotNone( + layers.pool2d( + x, + pool_size=[5, 3], + pool_stride=[1, 2], + pool_padding=(2, 1))) def test_lstm_unit(self): program = Program() From 0c5ed5f6fc2f7d7a8936c70d2005cf3e85c23df6 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 22 Nov 2018 10:04:10 +0000 Subject: [PATCH 052/113] enable peephole jitcode test=develop --- paddle/fluid/operators/math/jit_code.cc | 28 +++++++++++++++++-- paddle/fluid/operators/math/jit_kernel_rnn.cc | 2 +- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 03b67238fe..95247ce309 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -221,10 +221,14 @@ void LSTMJitCode::generate() { reg64_t reg_ptr_ct_1 = r9; reg64_t reg_ptr_ct = r10; reg64_t reg_ptr_ht = r11; + reg64_t reg_ptr_wp = r12; mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + if (use_peephole_) { + mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]); + } int offset = 0; int d = num_ * sizeof(float); @@ -235,13 +239,27 @@ void LSTMJitCode::generate() { act(ymm_c, ymm_src, act_cand_); // i vmovups(ymm_src, ptr[reg_ptr_gates + offset + d]); + if (!compute_c1h1_ && use_peephole_) { + ymm_t ymm_wp = ymm_t(2); + ymm_t ymm_ct_1 = ymm_t(3); + vmovups(ymm_wp, ptr[reg_ptr_wp + offset]); + vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]); + vmulps(ymm_wp, ymm_ct_1, ymm_wp); + vaddps(ymm_src, ymm_src, ymm_wp); + } act(ymm_i, ymm_src, act_gate_); vmulps(ymm_c, ymm_c, ymm_i); if (!compute_c1h1_) { // f vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * d]); - act(ymm_f, ymm_src, act_gate_); vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]); + if (use_peephole_) { + ymm_t ymm_wp = ymm_t(3); + vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d]); + vmulps(ymm_wp, ymm_i, ymm_wp); + vaddps(ymm_src, ymm_src, ymm_wp); + } + act(ymm_f, ymm_src, act_gate_); vmulps(ymm_f, ymm_f, ymm_i); vaddps(ymm_f, ymm_f, ymm_c); } @@ -250,8 +268,14 @@ void LSTMJitCode::generate() { ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c; ymm_t ymm_tmp = ymm_i; vmovups(ptr[reg_ptr_ct + offset], ymm_ct); // save ct - act(ymm_tmp, ymm_ct, act_cell_); vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * d]); + if (use_peephole_) { + ymm_t ymm_wp = ymm_t(2); + vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d * 2]); + vmulps(ymm_wp, ymm_ct, ymm_wp); + vaddps(ymm_src, ymm_src, ymm_wp); + } + act(ymm_tmp, ymm_ct, act_cell_); act(ymm_o, ymm_src, act_gate_); vmulps(ymm_o, ymm_tmp, ymm_o); vmovups(ptr[reg_ptr_ht + offset], ymm_o); // save ht diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index e571d8adf4..85ea95cfcc 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -108,7 +108,7 @@ class PeepholeKernelImpl : public LSTMKernel { #ifdef PADDLE_WITH_XBYAK template <> bool PeepholeKernelImpl::useJIT(int d) { - return false; // peephole jitcode not ready yet + return gen::LSTMJitCode::init(d); } #endif From 83370576cd8f35e4155d94a789c886c8c264056d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 22 Nov 2018 18:52:54 +0800 Subject: [PATCH 053/113] Add sqlite3 support in Python3.6 test=develop --- tools/manylinux1/build_scripts/build_utils.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh index d97745ad2d..48cce15a14 100755 --- a/tools/manylinux1/build_scripts/build_utils.sh +++ b/tools/manylinux1/build_scripts/build_utils.sh @@ -50,6 +50,15 @@ function do_cpython_build { mkdir -p ${prefix}/lib # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6 + if [ $(lex_pyver $py_ver) -eq $(lex_pyver 3.6) ]; then + wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz + tar -zxf sqlite-autoconf-3250300.tar.gz + cd sqlite-autoconf-3250300 + ./configure --prefix=/usr/local + make -j8 && make install + cd ../ && rm sqlite-autoconf-3250300.tar.gz + fi + # NOTE --enable-shared for generating libpython shared library needed for # linking of some of the nupic.core test executables. if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then @@ -59,9 +68,9 @@ function do_cpython_build { make -j8 > /dev/null make altinstall > /dev/null else - CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null - make -j8 > /dev/null - make install > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make -j8 > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make install > /dev/null fi popd echo "ZZZ looking for libpython" From 7c8c9dc9bf441ee3360ec416fd71dbf5921ba391 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 22 Nov 2018 19:15:47 +0800 Subject: [PATCH 054/113] fix unit test cases --- cmake/generic.cmake | 11 ++++++-- .../framework/details/all_reduce_op_handle.cc | 4 +-- .../framework/details/all_reduce_op_handle.h | 6 ++--- .../framework/details/broadcast_op_handle.cc | 2 +- .../framework/details/broadcast_op_handle.h | 6 ++--- .../details/broadcast_op_handle_test.h | 12 ++++----- .../fluid/framework/details/build_strategy.cc | 4 +-- .../fluid/framework/details/build_strategy.h | 4 +-- .../details/data_balance_op_handle.cc | 2 +- .../details/data_balance_op_handle.h | 4 +-- .../details/fused_broadcast_op_handle.h | 4 +-- .../details/fused_broadcast_op_handle_test.cc | 4 +-- .../details/multi_devices_graph_pass.cc | 16 +++++------ .../details/multi_devices_graph_pass.h | 2 +- .../framework/details/reduce_op_handle.cc | 2 +- .../framework/details/reduce_op_handle.h | 4 +-- .../details/reduce_op_handle_test.cc | 12 ++++----- .../fluid/framework/ir/is_test_pass_tester.cc | 5 +++- .../inference/analysis/analyzer_tester.cc | 3 ++- paddle/fluid/inference/api/helper.h | 5 +--- .../inference/tests/api/anakin_rnn1_tester.cc | 1 - .../tests/book/test_inference_nlp.cc | 1 - paddle/fluid/inference/tests/test_helper.h | 1 + paddle/fluid/operators/beam_search_op_test.cc | 16 +++++------ .../operators/distributed/grpc_client.cc | 2 +- .../fluid/operators/distributed/grpc_serde.cc | 2 +- .../fluid/operators/distributed/grpc_serde.h | 3 ++- .../operators/distributed/sendrecvop_utils.cc | 2 +- .../operators/distributed/sendrecvop_utils.h | 2 +- paddle/fluid/operators/math/cpu_vec_test.cc | 2 +- paddle/fluid/operators/math/im2col_test.cc | 2 +- .../fluid/operators/math/jit_kernel_test.cc | 2 +- paddle/fluid/platform/cudnn_helper.h | 2 +- paddle/fluid/platform/dynload/cudnn.h | 14 +++++----- paddle/fluid/platform/gpu_info.cc | 5 +++- .../fluid/platform/stream_callback_manager.h | 2 +- paddle/legacy/cuda/include/hl_warpctc_wrap.h | 3 ++- paddle/legacy/cuda/src/hl_cuda_device.cc | 4 +++ paddle/legacy/utils/ThreadLocal.h | 4 ++- paddle/legacy/utils/Util.h | 27 +++++++++++++++++++ paddle/testing/CMakeLists.txt | 6 +++-- python/paddle/fluid/metrics.py | 4 +-- .../fluid/tests/unittests/CMakeLists.txt | 8 +++--- 43 files changed, 138 insertions(+), 89 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 111627a932..cabef3f713 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -349,10 +349,17 @@ function(cc_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WIN32) + list(APPEND win32_deps shlwapi) + if("${cc_test_DEPS};" MATCHES "python;") + list(REMOVE_ITEM cc_test_DEPS python) + list(APPEND win32_deps ${PYTHON_LIBRARIES}) + endif() + endif(WIN32) add_executable(${TARGET_NAME} ${cc_test_SRCS}) target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) if(WIN32) - target_link_libraries(${TARGET_NAME} shlwapi) + target_link_libraries(${TARGET_NAME} ${win32_deps}) endif(WIN32) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(NAME ${TARGET_NAME} @@ -679,7 +686,7 @@ function(py_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true + COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true FLAGS_cpu_deterministic=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index b869015676..a003995ae3 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -23,7 +23,7 @@ namespace paddle { namespace framework { namespace details { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -74,7 +74,7 @@ void AllReduceOpHandle::RunImpl() { } if (platform::is_gpu_place(lod_tensors[0]->place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); int dtype = -1; size_t numel = 0; diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index f6ef3a1367..b449796fca 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -29,7 +29,7 @@ namespace framework { namespace details { struct AllReduceOpHandle : public OpHandleBase { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); @@ -49,7 +49,7 @@ struct AllReduceOpHandle : public OpHandleBase { private: std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; #endif }; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 8e5e542765..d98df3bbad 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -82,7 +82,7 @@ void BroadcastOpHandle::BroadcastOneVar( }); } } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) VarHandle *out_handle = nullptr; int root_id = boost::get(in_tensor.place()).device; std::vector> broadcast_calls; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 72180fac86..0c75e05f86 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -24,7 +24,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -34,7 +34,7 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *nccl_ctxs) @@ -68,7 +68,7 @@ struct BroadcastOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; #endif diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 4305eb6573..df3b3cc9ca 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -42,7 +42,7 @@ struct TestBroadcastOpHandle { std::vector> nodes_; std::vector place_list_; bool use_gpu_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; #endif @@ -50,7 +50,7 @@ struct TestBroadcastOpHandle { for (size_t j = 0; j < ctxs_.size(); ++j) { ctxs_[j]->Wait(); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_) { nccl_ctxs_->WaitAll(); } @@ -60,7 +60,7 @@ struct TestBroadcastOpHandle { void InitCtxOnGpu(bool use_gpu) { use_gpu_ = use_gpu; if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) int count = p::GetCUDADeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -84,7 +84,7 @@ struct TestBroadcastOpHandle { place_list_.push_back(p); ctxs_.emplace_back(new p::CPUDeviceContext(p)); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_.reset(nullptr); #endif } @@ -106,14 +106,14 @@ struct TestBroadcastOpHandle { nodes_.emplace_back( ir::CreateNodeForTest("node0", ir::Node::Type::kOperation)); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not support."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 37202f8695..70baced0ad 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -96,7 +96,7 @@ std::unique_ptr BuildStrategy::Apply( const std::string &loss_var_name, const std::unordered_set ¶m_names, const std::vector &local_scopes, -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { #else const bool use_cuda) const { @@ -118,7 +118,7 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("local_scopes"); pass->SetNotOwned>("local_scopes", &local_scopes); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index fc2641dbd4..3236c35efd 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -23,7 +23,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -98,7 +98,7 @@ struct BuildStrategy { const std::string &loss_var_name, const std::unordered_set ¶m_names, const std::vector &local_scopes, -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; #else const bool use_cuda) const; diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc index 0b772f9b63..cc562c7b10 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -20,7 +20,7 @@ namespace paddle { namespace framework { namespace details { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) DataBalanceOpHandle::DataBalanceOpHandle( ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h index 0462fb6ec7..2db18a1a72 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.h +++ b/paddle/fluid/framework/details/data_balance_op_handle.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -29,7 +29,7 @@ namespace details { struct DataBalanceOpHandle : public OpHandleBase { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index e37259526a..e43d545c9c 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -25,7 +25,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -35,7 +35,7 @@ namespace details { struct FusedBroadcastOpHandle : public BroadcastOpHandle { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) FusedBroadcastOpHandle(ir::Node *node, const std::vector local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index 541993c743..be0d941c4f 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -44,14 +44,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { nodes_.emplace_back( ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation)); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not supported."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 8c98b78130..26666212ae 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -142,7 +142,7 @@ void MultiDevSSAGraphBuilder::Init() const { places_ = Get>(kPlaces); local_scopes_ = Get>(kLocalScopes); strategy_ = Get(kStrategy); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_ = &Get("nccl_ctxs"); #endif @@ -431,7 +431,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } bool use_gpu = false; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) use_gpu = nccl_ctxs_ != nullptr; #endif @@ -478,7 +478,7 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { void MultiDevSSAGraphBuilder::SetCommunicationContext( OpHandleBase *op_handle, const platform::Place &p) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_ == nullptr) { op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -492,7 +492,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); @@ -522,7 +522,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new FusedBroadcastOpHandle( result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); @@ -568,7 +568,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, const std::string &og) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); @@ -597,7 +597,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertDataBalanceOp( ir::Graph *result, const std::vector &datas) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); @@ -694,7 +694,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index f3ec2d2941..8e462aec7d 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -40,7 +40,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { size_t device_id) const; void Init() const; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) mutable platform::NCCLContextMap *nccl_ctxs_; #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 4503123eac..c9f1107aea 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -125,7 +125,7 @@ void ReduceOpHandle::RunImpl() { } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto pre_in = pre_in_var->Get(); VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); VariableVisitor::GetMutableTensor(out_var).mutable_data( diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 999828ae45..846839029c 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -35,7 +35,7 @@ struct ReduceOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 72299c0bfa..6cee4770e6 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -35,7 +35,7 @@ struct TestReduceOpHandle { std::vector gpu_list_; std::vector> ctxs_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; #endif @@ -43,7 +43,7 @@ struct TestReduceOpHandle { for (size_t j = 0; j < ctxs_.size(); ++j) { ctxs_[j]->Wait(); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_) { nccl_ctxs_->WaitAll(); } @@ -53,7 +53,7 @@ struct TestReduceOpHandle { void InitCtxOnGpu(bool use_gpu) { use_gpu_ = use_gpu; if (use_gpu) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) int count = p::GetCUDADeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -77,7 +77,7 @@ struct TestReduceOpHandle { gpu_list_.push_back(p); ctxs_.emplace_back(new p::CPUDeviceContext(p)); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_.reset(nullptr); #endif } @@ -99,14 +99,14 @@ struct TestReduceOpHandle { nodes.emplace_back(new ir::Node("node")); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get())); #else PADDLE_THROW("CUDA is not support."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get())); #else diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index cd2cb0c9f8..9696441a21 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -15,7 +15,10 @@ #include "paddle/fluid/framework/ir/is_test_pass.h" #include - +#ifdef _WIN32 +#undef FALSE +#undef TRUE +#endif namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 84a0c3374c..7710ed7b61 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { @@ -75,7 +76,7 @@ void TestWord2vecPrediction(const std::string& model_path) { 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. - for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + for (size_t i = 0; i < std::min((size_t)5UL, num_elements); i++) { LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i]; PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 6f9d663121..9a393a61c4 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,10 +15,6 @@ #pragma once #include -#if !defined(_WIN32) -#include -#else -#endif #include #include // NOLINT @@ -28,6 +24,7 @@ #include #include #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/printf.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc index c4022225fd..da42688f29 100644 --- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include #include #include diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index cbcfc964c9..5c1204b9e6 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include // NOLINT diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 2118fcfd4b..75fa611c0d 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(use_mkldnn); diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index 501807e7f3..80fdd22fbb 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -30,23 +30,23 @@ using std::endl; void CreateInput(LoDTensor* ids, LoDTensor* scores) { LoD lod; - vector level0({0, 2, 4}); - vector level1({0, 1, 2, 3, 4}); + vector level0{0, 2, 4}; + vector level1{0, 1, 2, 3, 4}; lod.push_back(level0); lod.push_back(level1); ids->set_lod(lod); scores->set_lod(lod); - auto dims = framework::make_ddim(vector({4, 3})); + auto dims = framework::make_ddim(vector{4, 3}); ids->Resize(dims); scores->Resize(dims); CPUPlace place; auto* ids_data = ids->mutable_data(place); auto* scores_data = scores->mutable_data(place); - vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - vector _scores( - {0.5, 0.3, 0.2, 0.6, 0.3, 0.1, 0.9, 0.5, 0.1, 0.7, 0.5, 0.1}); + vector _ids{4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}; + vector _scores{0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, + 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}; for (int i = 0; i < 12; i++) { ids_data[i] = _ids[i]; @@ -79,8 +79,8 @@ TEST(DISABLED_beam_search_op, run) { ASSERT_EQ(sids.lod(), sscores.lod()); - vector tids({4, 2, 3, 8}); - vector tscores({0.5, 0.6, 0.9, 0.7}); + vector tids{4, 2, 3, 8}; + vector tscores{0.5f, 0.6f, 0.9f, 0.7f}; for (int i = 0; i < 4; i++) { ASSERT_EQ(tids[i], sids.data()[i]); diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index c28f86146d..3548d5d9fb 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "glog/logging.h" // For VLOG @@ -20,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index f27b70a5a3..e6856676d4 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -15,7 +15,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif -#include #include // NOLINT #include "google/protobuf/io/coded_stream.h" @@ -26,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h index 7ec489e961..17290d3fb4 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.h +++ b/paddle/fluid/operators/distributed/grpc_serde.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include + #include #include #include @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 374fa680e3..0abebb9240 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -15,12 +15,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif -#include #include // NOLINT #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 480fc59c42..523e56fe3e 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include #include #include #include @@ -24,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index 18a586f8dd..ad734bae42 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include @@ -22,6 +21,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/port.h" inline double GetCurrentUS() { struct timeval time; diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index ae2c90b33a..521cd7801a 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include -#include #include #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" +#include "paddle/fluid/platform/port.h" template void testIm2col() { diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index b6c62a2634..8662e1c50d 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" -#include #include // for exp #include // for memcpy #include @@ -22,6 +21,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/platform/port.h" #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 682b0c0ff3..61a25064d1 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -62,7 +62,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { #define CUDNN_ENFORCE(condition) \ do { \ - cudnnStatus_t status = condition; \ + auto status = condition; \ if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \ PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \ } \ diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 1a83ac7780..db62377898 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -48,13 +48,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name); #else -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline cudnnStatus_t operator()(Args... args) { \ - return ::__name(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) { \ + return ::__name(args...); \ + } \ + }; \ extern DynLoad__##__name __name #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c78f159ad2..e0d0051ad0 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -19,7 +19,10 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" -DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, +// fraction_of_gpu_memory_to_use cannot be too high on windows, +// since the win32 graphic sub-system can occupy some GPU memory +// which may lead to insufficient memory left for paddle +DEFINE_double(fraction_of_gpu_memory_to_use, 0.5, "Allocate a trunk of gpu memory that is this fraction of the " "total gpu memory size. Future memory usage will be allocated " "from the trunk. If the trunk doesn't have enough gpu memory, " diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 11c68f3449..8dcfc4e748 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -18,7 +18,7 @@ #include #include #include -#include "ThreadPool.h" +#include #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/legacy/cuda/include/hl_warpctc_wrap.h b/paddle/legacy/cuda/include/hl_warpctc_wrap.h index 0857bd1aa1..09cbd6d450 100644 --- a/paddle/legacy/cuda/include/hl_warpctc_wrap.h +++ b/paddle/legacy/cuda/include/hl_warpctc_wrap.h @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifndef _WIN32 #ifndef HL_WARPCTC_WRAP_H_ #define HL_WARPCTC_WRAP_H_ - #include "ctc.h" #include "hl_base.h" @@ -91,3 +91,4 @@ extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, size_t* bytes); #endif // HL_WARPCTC_WRAP_H_ +#endif diff --git a/paddle/legacy/cuda/src/hl_cuda_device.cc b/paddle/legacy/cuda/src/hl_cuda_device.cc index 501e3b0f3b..a6e27a37ff 100644 --- a/paddle/legacy/cuda/src/hl_cuda_device.cc +++ b/paddle/legacy/cuda/src/hl_cuda_device.cc @@ -132,11 +132,15 @@ inline pid_t gettid() { uint64_t tid; pthread_threadid_np(NULL, &tid); #else +#ifndef _WIN32 #ifndef __NR_gettid #define __NR_gettid 224 #endif pid_t tid = syscall(__NR_gettid); #endif +#else // _WIN32 + pid_t tid = _getpid(); +#endif // _WIN32 CHECK_NE((int)tid, -1); return tid; } diff --git a/paddle/legacy/utils/ThreadLocal.h b/paddle/legacy/utils/ThreadLocal.h index c5b07506d3..6268b73a85 100644 --- a/paddle/legacy/utils/ThreadLocal.h +++ b/paddle/legacy/utils/ThreadLocal.h @@ -14,10 +14,12 @@ limitations under the License. */ #pragma once +#ifndef _WIN32 #include #include -#include #include +#endif +#include #include #include #include diff --git a/paddle/legacy/utils/Util.h b/paddle/legacy/utils/Util.h index e6f05e30d3..3a878b2b30 100644 --- a/paddle/legacy/utils/Util.h +++ b/paddle/legacy/utils/Util.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#ifndef _WIN32 #include // for syscall() +#endif #include #include #include @@ -40,6 +42,31 @@ inline int rand_r(unsigned int* seedp) { } #endif +#ifdef _WIN32 +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#include + +template +inline int __builtin_clz(const T& value) { + DWORD leadning_zero = 0; + if (_BitScanReverse(&leadning_zero, value)) { + return static_cast(sizeof(T) * 8 - leadning_zero); + } else { + return static_cast(0); + } +} + +inline int __builtin_clzl(const unsigned long& value) { + return __builtin_clz(value); +} + +inline int __builtin_clzll(const unsigned long long& value) { + return __builtin_clz(value); +} + +#define pid_t int +#endif + /** * Loop over the elements in a container * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach, diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 2264481899..614596958e 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -3,8 +3,10 @@ if(WITH_TESTING) add_library(paddle_test_main STATIC TestMain.cpp) add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies}) - add_library(paddle_test_util STATIC TestUtil.cpp) - add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) + if(NOT WIN32) + add_library(paddle_test_util STATIC TestUtil.cpp) + add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) + endif(NOT WIN32) if(NOT MOBILE_INFERENCE) cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags) endif() diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index f65b37903a..829154f1b2 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -46,8 +46,8 @@ def _is_numpy_(var): def _is_number_(var): - return isinstance(var, int) or isinstance(var, float) or (isinstance( - var, np.ndarray) and var.shape == (1, )) + return isinstance(var, int) or isinstance(var, np.int64) or isinstance( + var, float) or (isinstance(var, np.ndarray) and var.shape == (1, )) def _is_number_or_matrix_(var): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 510d0304f0..3fc12d584d 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -23,9 +23,11 @@ if(NOT WITH_DISTRIBUTE) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) endif(NOT WITH_DISTRIBUTE) -if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) - LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) -endif() +if(WITH_GPU) + if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) + endif() +endif(WITH_GPU) list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 From e280c7a4db7f5765e7b3b5b2146204705b348e5b Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 22 Nov 2018 19:52:10 +0800 Subject: [PATCH 055/113] code style fix test=develop --- paddle/fluid/platform/stream_callback_manager.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 8dcfc4e748..ed8734c98c 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -14,11 +14,11 @@ #pragma once +#include #include #include #include #include -#include #include "paddle/fluid/platform/enforce.h" namespace paddle { From 00b9e9a1357bb3fa6e6adceb4e650d9f6424aa2a Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 22 Nov 2018 20:40:56 +0800 Subject: [PATCH 056/113] Refine cublas to support CUBLAS_TENSOR_OP_MATH (#13929) * refine cublase test=develop * code refine * refine cublas * add GEMME_EX * add enable_cublas_tensor_op_math doc and add cublasCall test=develop * fix CublasCall for cuda version test=develop * fix error test=develop * fix GEMM_EX to be compatible with gcc 4.8 test=develop * add GEMM_EX test=develop * to compatiable with gcc4.8 test=develop --- paddle/fluid/operators/math/blas_impl.cu.h | 206 +++++++++++++++++---- paddle/fluid/platform/device_context.h | 47 +++++ paddle/fluid/platform/dynload/cublas.h | 26 ++- paddle/fluid/platform/gpu_info.cc | 20 ++ paddle/fluid/platform/gpu_info.h | 3 + python/paddle/fluid/__init__.py | 3 +- 6 files changed, 256 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d84c88cb3b..d35073029a 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -16,6 +16,9 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/gpu_info.h" + +DECLARE_bool(enable_cublas_tensor_op_math); namespace paddle { namespace operators { @@ -42,11 +45,44 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...)); #else PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const float *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const float *beta, void *C, + cudaDataType_t Ctype, int ldc) { + // Because the gcc 4.8 doesn't expand template parameter pack that + // appears in a lambda-expression, I can not use template parameter pack + // here. + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + VLOG(5) << "use_tensor_op_math: " + << (platform::TensorCoreAvailable() ? "True" : "False"); + PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc)); +#else + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -69,13 +105,18 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...)); #else PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5"); #endif } + + template + static void GEMM_EX(ARGS... args) { + PADDLE_THROW("Currently there are not cublasDgemmEx."); + } }; template <> @@ -96,14 +137,16 @@ struct CUBlas { reinterpret_cast<__half *>(C), ldc)); } - static void GEMM_BATCH(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const float16 *alpha, const float16 *A, int lda, - long long int strideA, const float16 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const float16 *beta, float16 *C, int ldc, - long long int strideC, // NOLINT - int batchCount) { + static void GEMM_STRIDED_BATCH(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float16 *alpha, const float16 *A, + int lda, long long int strideA, // NOLINT + const float16 *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const float16 *beta, float16 *C, int ldc, + long long int strideC, // NOLINT + int batchCount) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( handle, transa, transb, m, n, k, @@ -114,6 +157,45 @@ struct CUBlas { ldc, strideC, batchCount)); #else PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const void *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const void *beta, void *C, + cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + PADDLE_ENFORCE(platform::dynload::cublasGemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); +#else + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -133,8 +215,21 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, N); +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, N); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, N); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 } template <> @@ -157,30 +252,18 @@ inline void Blas::GEMM( PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53, "cublas fp16 gemm requires GPU compute capability >= 53"); -#if CUDA_VERSION >= 8000 float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - if (context_.GetComputeCapability() >= 70) { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH)); - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } else { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_DEFAULT_MATH)); - } -#endif // CUDA_VERSION >= 9000 - +#if CUDA_VERSION >= 8000 // cublasHgemm does true FP16 computation which is slow for non-Volta // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: // input/output in fp16, computation in fp32, which can also be accelerated // using tensor cores in volta GPUs. - PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B, - CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, - CUDA_R_32F, algo)); + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A, + CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, @@ -199,8 +282,38 @@ void Blas::GEMM(bool transA, bool transB, int M, // the cblas convention. cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, ldc); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + +template <> +template <> +inline void Blas::GEMM( + bool transA, bool transB, int M, int N, int K, platform::float16 alpha, + const platform::float16 *A, int lda, const platform::float16 *B, int ldb, + platform::float16 beta, platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, + ldc); } template <> @@ -238,9 +351,34 @@ void Blas::BatchedGEMM( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; - CUBlas::GEMM_BATCH(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, - strideC, batchCount); +#if CUDA_VERSION >= 9010 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto cublas_call = [&]() { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + + PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( + context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, + CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); + }; + auto &dev_ctx = const_cast(context_); + dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); + } else { +#endif // CUDA_VERSION >= 9010 + + CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, strideB, A, lda, + strideA, &beta, C, ldc, strideC, batchCount); + +#if CUDA_VERSION >= 9010 + } +#endif // CUDA_VERSION >= 9010 } } // namespace math diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 9a9018cdea..3edd727978 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -143,6 +143,39 @@ class CudnnWorkspaceHandle { std::unique_ptr> guard_; }; +#if CUDA_VERSION >= 9000 +class ScopedCublasMathMode { + public: + ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) + : handle_(handle) { + need_reset = false; + PADDLE_ENFORCE( + platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), + "Failed to get old cublas math mode"); + if (old_math_mode_ != new_math_mode) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, new_math_mode), + "Failed to set old cublas math mode"); + need_reset = true; + } + } + + ~ScopedCublasMathMode() { + if (need_reset) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, old_math_mode_), + "Failed to set old cublas math mode"); + } + } + + private: + cublasHandle_t handle_; + cublasMath_t old_math_mode_; + bool need_reset; +}; + +#endif + class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext { callback_manager_->Wait(); } +#if CUDA_VERSION >= 9000 + /*! \brief CublasCall may need to change cublas's config, + * but the cublas may be hold by multi-thread, so we should + * add lock here. */ + template + void CublasCall(Callback callback, cublasMath_t new_math) { + std::lock_guard guard(cublas_mtx_); + ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); + callback(); + } +#endif + private: CUDAPlace place_; @@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext { // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes mutable std::mutex callback_mtx_; std::unique_ptr callback_manager_; + + mutable std::mutex cublas_mtx_; }; template <> diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 4ea0cd7283..ff80bd525c 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -61,9 +61,6 @@ extern void *cublas_dso_handle; extern DynLoad__##__name __name #endif -#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ - DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) - #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasSaxpy_v2); \ __macro(cublasDaxpy_v2); \ @@ -93,22 +90,23 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // APIs available after CUDA 8.0 #if CUDA_VERSION >= 8000 -#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ - __macro(cublasGemmEx); \ - __macro(cublasSgemmStridedBatched); \ - __macro(cublasDgemmStridedBatched); \ - __macro(cublasCgemmStridedBatched); \ - __macro(cublasZgemmStridedBatched); \ - __macro(cublasHgemmStridedBatched); - -CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched); #endif // APIs available after CUDA 9.0 #if CUDA_VERSION >= 9000 -#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode); +#endif -CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +#if CUDA_VERSION >= 9010 +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx); #endif #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c78f159ad2..833d48347f 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -26,6 +26,16 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, "additional trunks of the same size will be requested from gpu " "until the gpu has no memory left for another trunk."); +DEFINE_bool( + enable_cublas_tensor_op_math, false, + "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " + "but it may loss precision. Currently, There are two CUDA libraries that" + " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up" + " GEMM computations(the matrices must be either half precision or single " + "precision); cuDNN uses Tensor Cores to speed up both convolutions(the " + "input and output must be half precision) and recurrent neural networks " + "(RNNs)."); + namespace paddle { namespace platform { @@ -64,6 +74,16 @@ int GetCUDADriverVersion(int id) { return driver_version; } +bool TensorCoreAvailable() { +#if CUDA_VERSION >= 9000 + int device = GetCurrentDeviceId(); + int driver_version = GetCUDAComputeCapability(device); + return driver_version >= 70; +#else + return false; +#endif +} + int GetCUDAMultiProcessors(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index be44158431..6a0b3c8e02 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id); //! Get the driver version of the ith GPU int GetCUDADriverVersion(int id); +//! Wheter the current device support TensorCore +bool TensorCoreAvailable(); + //! Get the MultiProcessors of the ith GPU. int GetCUDAMultiProcessors(int i); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 543acf2d34..3c092dee34 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -133,7 +133,8 @@ def __bootstrap__(): if core.is_compiled_with_cuda(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', - 'conv_workspace_size_limit', 'cudnn_exhaustive_search' + 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', + 'cudnn_exhaustive_search' ] core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) From 6cc6bf4074d69c5c0b02af612b94e438d596803a Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Thu, 22 Nov 2018 15:30:43 +0100 Subject: [PATCH 057/113] Bumped MKL-DNN version to 0.17 test=develop --- cmake/external/mkldnn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 785148d4f9..b280db23b9 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -53,7 +53,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "21fb5f2af1dd14e132af4f1b79160977ee487818" + GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} From a902b8b0f811f6837330385b95fa2f552393197c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 01:07:58 +0800 Subject: [PATCH 058/113] Add sqlite3 support test=develop --- tools/manylinux1/Dockerfile.x64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index e91216a5b8..48fd145e5f 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz COPY build_scripts /build_scripts RUN bash build_scripts/build.sh && \ - bash build_scripts/install_nccl2.sh && rm -r build_scripts + bash build_scripts/install_nccl2.sh && rm -rf build_scripts ENV SSL_CERT_FILE=/opt/_internal/certs.pem From 81bd7eeff4f3581c67cb294f94a14c3b1e97e40d Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 23 Nov 2018 11:04:11 +0800 Subject: [PATCH 059/113] rollback the format --- paddle/fluid/operators/beam_search_op_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index 80fdd22fbb..6e283866ff 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -30,23 +30,23 @@ using std::endl; void CreateInput(LoDTensor* ids, LoDTensor* scores) { LoD lod; - vector level0{0, 2, 4}; - vector level1{0, 1, 2, 3, 4}; + vector level0({0, 2, 4}); + vector level1({0, 1, 2, 3, 4}); lod.push_back(level0); lod.push_back(level1); ids->set_lod(lod); scores->set_lod(lod); - auto dims = framework::make_ddim(vector{4, 3}); + auto dims = framework::make_ddim(vector({4, 3})); ids->Resize(dims); scores->Resize(dims); CPUPlace place; auto* ids_data = ids->mutable_data(place); auto* scores_data = scores->mutable_data(place); - vector _ids{4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}; - vector _scores{0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, - 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}; + vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); + vector _scores({0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, + 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); for (int i = 0; i < 12; i++) { ids_data[i] = _ids[i]; @@ -79,8 +79,8 @@ TEST(DISABLED_beam_search_op, run) { ASSERT_EQ(sids.lod(), sscores.lod()); - vector tids{4, 2, 3, 8}; - vector tscores{0.5f, 0.6f, 0.9f, 0.7f}; + vector tids({4, 2, 3, 8}); + vector tscores({0.5f, 0.6f, 0.9f, 0.7f}); for (int i = 0; i < 4; i++) { ASSERT_EQ(tids[i], sids.data()[i]); From ff2a9786f3f8ba49daad21bd3d4550b394d46ca4 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 23 Nov 2018 03:08:33 +0000 Subject: [PATCH 060/113] test=develop --- python/paddle/fluid/__init__.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 543acf2d34..a6416326ed 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -91,6 +91,7 @@ def __bootstrap__(): """ import sys import os + import platform from . import core in_test = 'unittest' in sys.modules @@ -110,14 +111,17 @@ def __bootstrap__(): print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr) os.environ['OMP_NUM_THREADS'] = str(num_threads) - + sysstr = platform.system() read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', - 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', - 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', + 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', + 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', + 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", + 'eager_delete_tensor_gb', 'allocator_strategy', 'reader_queue_speed_test_mode', 'print_sub_graph_dir' ] + if sysstr != 'Darwin': + read_env_flags.append('use_pinned_memory') + if os.name != 'nt': read_env_flags.append('warpctc_dir') read_env_flags.append('cpu_deterministic') From 9ea1ce63192fee1a211aa5dcc6fecf4758434451 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 23 Nov 2018 11:51:07 +0800 Subject: [PATCH 061/113] Update issue templates --- .github/ISSUE_TEMPLATE/---feature-request-.md | 27 +++++++++++++ .github/ISSUE_TEMPLATE/---inference-issue-.md | 40 +++++++++++++++++++ .../ISSUE_TEMPLATE/---installation-issue-.md | 40 +++++++++++++++++++ .github/ISSUE_TEMPLATE/---model-issue-.md | 36 +++++++++++++++++ .github/ISSUE_TEMPLATE/---others-.md | 33 +++++++++++++++ .github/ISSUE_TEMPLATE/---training-issue-.md | 38 ++++++++++++++++++ 6 files changed, 214 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/---feature-request-.md create mode 100644 .github/ISSUE_TEMPLATE/---inference-issue-.md create mode 100644 .github/ISSUE_TEMPLATE/---installation-issue-.md create mode 100644 .github/ISSUE_TEMPLATE/---model-issue-.md create mode 100644 .github/ISSUE_TEMPLATE/---others-.md create mode 100644 .github/ISSUE_TEMPLATE/---training-issue-.md diff --git a/.github/ISSUE_TEMPLATE/---feature-request-.md b/.github/ISSUE_TEMPLATE/---feature-request-.md new file mode 100644 index 0000000000..57708855dc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---feature-request-.md @@ -0,0 +1,27 @@ +--- +name: 建议(Feature request) +about: 您可以提出您的建议。 You could use this template for reporting a suggestion  issue. + +--- + +欢迎您对PaddlePaddle提出建议,非常感谢您对PaddlePaddle的贡献! +在留下您的建议时,辛苦您同步提供如下信息: +- 版本、环境信息 +1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1 +2)CPU/GPU:您是否使用GPU进行训练,如是,请提供您的CUDA和cuDNN版本号 +3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 建议描述:请您详细描述,您认为需优化的功能 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. +Please make sure that this is a feature request. +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +**To Reproduce** +Steps to reproduce the behavior +**Describe the feature and the current behavior/state.** +**Any Other info.** diff --git a/.github/ISSUE_TEMPLATE/---inference-issue-.md b/.github/ISSUE_TEMPLATE/---inference-issue-.md new file mode 100644 index 0000000000..37bdc8889e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---inference-issue-.md @@ -0,0 +1,40 @@ +--- +name: 预测(Inference Issue) +about: 您可以提问预测中报错、应用等问题。 You could use this template for reporting an inference issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准描述您的问题,例如“最新预测库的API文档在哪儿 ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID +    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况 +    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号 +    4)系统环境:请您描述系统类型、版本(如Mac OS 10.14),Python版本 +-预测信息 +    1)C++预测:请您提供预测库安装包的版本信息,及其中的version.txt文件 +    2)CMake包含路径的完整命令 +    3)API信息(如调用请提供) +    4)预测库来源:官网下载/特殊环境(如BCLOUD编译) +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that th +If there is no solution,please make sure that this is an inference issue including the following details : +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Python version +-Cmake orders +-C++version.txt +-API information +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---installation-issue-.md b/.github/ISSUE_TEMPLATE/---installation-issue-.md new file mode 100644 index 0000000000..ce4ba58932 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---installation-issue-.md @@ -0,0 +1,40 @@ +--- +name: 安装(Installation Issue) +about: 您可以提问安装、编译出现报错等问题。 You could use this template for reporting an installation +  issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +建立issue时,为快速解决问题,请您根据使用情况给出如下信息: +- 标题:请包含关键词“安装错误”/“编译错误”,例如“Mac编译错误” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID +    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况 +    3)GPU:请提供GPU型号,CUDA和CUDNN版本号 +    4)系统环境:请说明系统类型、版本(如Mac OS 10.14)、Python版本 +- 安装方式信息: +1)pip安装/docker安装 +2)本地编译:请提供cmake命令,编译命令 +3)docker编译:请提供docker镜像,编译命令            +  特殊环境请注明:如离线安装等 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in Github in case that there was a similar issue submitted or resolved before. +If there is no solution,please make sure that this is an installation issue including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg. Mac OS 10.14) +-Python version +- Install method: pip install/install with docker/build from source(without docker)/build within docker +- Other special cases that you think may be related to this problem, eg. offline install, special internet condition   +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---model-issue-.md b/.github/ISSUE_TEMPLATE/---model-issue-.md new file mode 100644 index 0000000000..7cb52f37b9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---model-issue-.md @@ -0,0 +1,36 @@ +--- +name: 模型(Model Issue) +about: 您可以提问模型、算法、数据集方向的使用报错等问题。You could use this template for reporting a model/ + algorithm/dataset  issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +建立issue时,为快速解决问题,请您根据使用情况给出如下信息: +- 标题:简洁、精准描述您的问题,例如“ssd 模型前置lstm报错  ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供PaddlePaddle版本号,例如1.1或CommitID +    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况 +    3)GPU:请提供GPU型号,CUDA和CUDNN版本号 +    4)系统环境:请说明系统类型、版本(例如Mac OS 10.14),Python版本 +- 模型信息 +    1)模型名称 2)使用数据集名称 3)使用算法名称 4)模型链接 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github.Probably there was a similar issue submitted or resolved before. +If there is no solution,please make sure that this is a issue of models including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Python version +-Name of Models&Dataset/details of operator +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---others-.md b/.github/ISSUE_TEMPLATE/---others-.md new file mode 100644 index 0000000000..6a291153e4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---others-.md @@ -0,0 +1,33 @@ +--- +name: 其他(Others) +about: 如上述分类未包含您的问题,可在此提出。 You could use this template for reporting other issues + +--- + +为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准概括您的问题 +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID +    2)CPU/GPU:如果您使用GPU训练,请提供GPU驱动版本、CUDA和cuDNN版本号 +    3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14 +    4)Python版本号 +    5)显存信息 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. +If there is no solution,please provide us with the following details : +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/cuDNN version +-OS Platform and Distribution(eg.Mac OS 10.14) +-Python version +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---training-issue-.md b/.github/ISSUE_TEMPLATE/---training-issue-.md new file mode 100644 index 0000000000..29e8383d97 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---training-issue-.md @@ -0,0 +1,38 @@ +--- +name: 训练(Training issue) +about: 您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting an training +  issue. + +--- + +为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准概括您的问题,例如“Insufficient Memory xxx" ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID +    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况 +    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号 +    4)系统环境:请您描述系统类型、版本,例如Mac OS 10.14,Python版本 +- 训练信息 +    1)单机/多机,单卡/多卡 +    2)显存信息 +    3)Operator信息 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. +If there is no solution,please make sure that this is a training issue including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Other imformation: Distriuted training/informantion of operator/ +Graphics card storage +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** From 6a7f83d45df2ff22c49867837c97f0773421ee0c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 23 Nov 2018 04:11:28 +0000 Subject: [PATCH 062/113] enable gru jitcode and refine act and lstm jitcode test=develop --- paddle/fluid/operators/math/jit_code.cc | 183 ++++++++++-------- paddle/fluid/operators/math/jit_code.h | 90 ++++----- .../fluid/operators/math/jit_kernel_refer.h | 4 +- paddle/fluid/operators/math/jit_kernel_rnn.cc | 6 +- .../fluid/operators/math/jit_kernel_test.cc | 2 + 5 files changed, 149 insertions(+), 136 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 95247ce309..52cbdf685d 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -140,32 +140,10 @@ bool VActJitCode::init(int d, operand_type type) { } void VActJitCode::generate() { - xmm_t xmm_zero = xmm_t(2); - ymm_t ymm_zero = ymm_t(2); - if (type_ == operand_type::relu) { - vxorps(ymm_zero, ymm_zero, ymm_zero); - } int offset = 0; for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { vmovups(ymm_src, ptr[param1 + offset]); - switch (type_) { - case operand_type::relu: - relu_jmm(ymm_dst, ymm_src, ymm_zero); - break; - case operand_type::exp: - exp_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::sigmoid: - sigmoid_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::tanh: - tanh_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::identity: - break; - default: - break; - } + act(ymm_dst, ymm_src, type_); vmovups(ptr[param2 + offset], ymm_dst); offset += sizeof(float) * YMM_FLOAT_BLOCK; } @@ -182,22 +160,7 @@ void VActJitCode::generate() { block = 1; vmovss(xmm_src, ptr[param1 + offset]); } - switch (type_) { - case operand_type::relu: - relu_jmm(xmm_dst, xmm_src, xmm_zero); - break; - case operand_type::exp: - exp_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - case operand_type::sigmoid: - sigmoid_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - case operand_type::tanh: - tanh_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - default: - break; - } + act(xmm_dst, xmm_src, type_); if (rest >= 4) { vmovups(ptr[param2 + offset], xmm_dst); } else if (rest >= 2) { @@ -233,52 +196,64 @@ void LSTMJitCode::generate() { int offset = 0; int d = num_ * sizeof(float); for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - // c - vmovups(ymm_src, ptr[reg_ptr_gates + offset]); - act(ymm_c, ymm_src, act_cand_); - // i - vmovups(ymm_src, ptr[reg_ptr_gates + offset + d]); - if (!compute_c1h1_ && use_peephole_) { - ymm_t ymm_wp = ymm_t(2); - ymm_t ymm_ct_1 = ymm_t(3); - vmovups(ymm_wp, ptr[reg_ptr_wp + offset]); + /* gates: W_ch, W_ih, W_fh, W_oh */ + ymm_t ymm_c = ymm_t(0); + ymm_t ymm_i = ymm_t(1); + ymm_t ymm_f = ymm_t(2); + ymm_t ymm_o = ymm_t(3); + ymm_t ymm_ct_1 = ymm_t(4); + ymm_t ymm_wp0 = ymm_t(5); + ymm_t ymm_wp1 = ymm_t(6); + ymm_t ymm_wp2 = ymm_t(7); + vmovups(ymm_c, ptr[reg_ptr_gates + offset]); + vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]); + vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]); + vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]); + if (!compute_c1h1_) { vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]); - vmulps(ymm_wp, ymm_ct_1, ymm_wp); - vaddps(ymm_src, ymm_src, ymm_wp); } - act(ymm_i, ymm_src, act_gate_); + if (use_peephole_) { + vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]); + vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]); + vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]); + } + /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */ + // act_cand(c) + act(ymm_c, ymm_c, act_cand_); + // act_gate(i) or act_gate(ct_1 * wp0 + i) + if (!compute_c1h1_ && use_peephole_) { + vmulps(ymm_wp0, ymm_ct_1, ymm_wp0); + vaddps(ymm_i, ymm_i, ymm_wp0); + } + act(ymm_i, ymm_i, act_gate_); vmulps(ymm_c, ymm_c, ymm_i); if (!compute_c1h1_) { - // f - vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * d]); - vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]); + // act_gate(f) or act_gate(ct_1 * wp1 + f) if (use_peephole_) { - ymm_t ymm_wp = ymm_t(3); - vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d]); - vmulps(ymm_wp, ymm_i, ymm_wp); - vaddps(ymm_src, ymm_src, ymm_wp); + vmulps(ymm_wp1, ymm_ct_1, ymm_wp1); + vaddps(ymm_f, ymm_f, ymm_wp1); } - act(ymm_f, ymm_src, act_gate_); - vmulps(ymm_f, ymm_f, ymm_i); + act(ymm_f, ymm_f, act_gate_); + // ct + vmulps(ymm_f, ymm_f, ymm_ct_1); vaddps(ymm_f, ymm_f, ymm_c); } - /* H_t = act_cell(C_t) * ogated */ + /* H_t = act_cell(C_t) * act_gate(o) */ + // act_cell(C_t) ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f; - ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c; ymm_t ymm_tmp = ymm_i; - vmovups(ptr[reg_ptr_ct + offset], ymm_ct); // save ct - vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * d]); + act(ymm_tmp, ymm_ct, act_cell_); + // act_gate(o) or act_gate(ct * wp2 + o) if (use_peephole_) { - ymm_t ymm_wp = ymm_t(2); - vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d * 2]); - vmulps(ymm_wp, ymm_ct, ymm_wp); - vaddps(ymm_src, ymm_src, ymm_wp); + vmulps(ymm_wp2, ymm_ct, ymm_wp2); + vaddps(ymm_o, ymm_o, ymm_wp2); } - act(ymm_tmp, ymm_ct, act_cell_); - act(ymm_o, ymm_src, act_gate_); - vmulps(ymm_o, ymm_tmp, ymm_o); - vmovups(ptr[reg_ptr_ht + offset], ymm_o); // save ht + act(ymm_o, ymm_o, act_gate_); + // ht + vmulps(ymm_o, ymm_o, ymm_tmp); + // save ct and ht + vmovups(ptr[reg_ptr_ct + offset], ymm_ct); + vmovups(ptr[reg_ptr_ht + offset], ymm_o); offset += sizeof(float) * YMM_FLOAT_BLOCK; } @@ -293,13 +268,61 @@ bool GRUJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } void GRUJitCode::generate() { reg64_t reg_ptr_gates = rax; - reg64_t reg_ptr_ct_1 = r9; - reg64_t reg_ptr_ct = r10; - reg64_t reg_ptr_ht = r11; - mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); - mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); - mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); - mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + reg64_t reg_ptr_ht_1 = r9; + reg64_t reg_ptr_ht = r10; + mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]); + mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]); + ymm_t ymm_one = ymm_t(0); + + if (id_ == 2) { + reg64_t reg_ptr_tmp = r11; + mov(reg_ptr_tmp, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]); + } + int offset = 0; + int d = num_ * sizeof(float); + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + ymm_t ymm_u = ymm_t(1); + ymm_t ymm_r = ymm_t(2); + ymm_t ymm_s = ymm_t(3); + ymm_t ymm_ht_1 = ymm_t(4); + // W: {W_update, W_reset; W_state} + if (id_ == 0 || id_ == 2) { + vmovups(ymm_u, ptr[reg_ptr_gates + offset]); + vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]); + } + if (id_ == 1) { + vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]); + } + if (id_ == 1 || id_ == 2) { + vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]); + } + + if (id_ == 0) { + // ht = act_gate(u) * act_cand(s) + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_s); + } else if (id_ == 1) { + // ht = act_gate(r) * ht_1 + act(ymm_r, ymm_r, act_gate_); + vmulps(ymm_r, ymm_r, ymm_ht_1); + vmovups(ptr[reg_ptr_ht + offset], ymm_r); + } else if (id_ == 2) { + // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 + ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx()); + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vsubps(ymm_u, ymm_one_inner, ymm_u); + vmulps(ymm_u, ymm_ht_1, ymm_u); + vaddps(ymm_u, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_u); + } + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } ret(); } diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 403cea3991..a921462129 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -169,31 +169,34 @@ class VActJitCode : public JitCode { protected: // compute relu with ymm, xmm template - void relu_jmm(JMM& dst, JMM& src, JMM& zero) { // NOLINT + void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) { // NOLINT + JMM zero = JMM(zero_idx); + vxorps(zero, zero, zero); vmaxps(dst, src, zero); } // compute exp with ymm, xmm template - void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT - int mask_idx = 4, int tmp_idx = 5) { - using namespace platform::jit; // NOLINT - assert(src.getIdx() != dst.getIdx()); // TODO(TJ): use enfore + void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT + int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { + using namespace platform::jit; // NOLINT // check all idx can not equal + JMM jmm_src = JMM(src_idx); JMM jmm_fx = JMM(fx_idx); JMM jmm_fy = JMM(fy_idx); JMM jmm_mask = JMM(mask_idx); JMM jmm_tmp = JMM(tmp_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(src, src, jmm_tmp); + vminps(jmm_src, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); - vmaxps(src, src, jmm_tmp); + vmaxps(jmm_src, jmm_src, jmm_tmp); // express exp(x) as exp(g + n*log(2)) vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(jmm_fx, src, jmm_tmp); + vmulps(jmm_fx, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); vaddps(jmm_fx, jmm_fx, jmm_tmp); vroundps(jmm_fy, jmm_fx, 0x01); @@ -207,21 +210,21 @@ class VActJitCode : public JitCode { vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); JMM ymm_z = JMM(jmm_mask.getIdx()); vmulps(ymm_z, jmm_fx, jmm_tmp); - vsubps(src, src, jmm_fy); - vsubps(src, src, ymm_z); - vmulps(ymm_z, src, src); + vsubps(jmm_src, jmm_src, jmm_fy); + vsubps(jmm_src, jmm_src, ymm_z); + vmulps(ymm_z, jmm_src, jmm_src); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(dst, src, jmm_tmp); + vmulps(dst, jmm_src, jmm_tmp); for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; i += (YMM_FLOAT_BLOCK * sizeof(float))) { vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4 vaddps(dst, dst, jmm_tmp); - vmulps(dst, dst, src); + vmulps(dst, dst, jmm_src); } vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); vaddps(dst, dst, jmm_tmp); vmulps(dst, dst, ymm_z); - vaddps(dst, dst, src); + vaddps(dst, dst, jmm_src); vmovaps(jmm_tmp, ptr[reg_ptr_global]); vaddps(dst, dst, jmm_tmp); // build 2^n @@ -258,20 +261,23 @@ class VActJitCode : public JitCode { // compute sigmoid with ymm, xmm template - void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2, // NOLINT - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) { + void sigmoid_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { // y = 1 / (1 + e^-x) JMM jmm_tmp = JMM(tmp_idx); + JMM jmm_src = JMM(src_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); - vminps(src, src, jmm_tmp); + vminps(jmm_src, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); - vmaxps(src, src, jmm_tmp); + vmaxps(jmm_src, jmm_src, jmm_tmp); vxorps(jmm_tmp, jmm_tmp, jmm_tmp); - vsubps(src, jmm_tmp, src); - exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vsubps(jmm_src, jmm_tmp, jmm_src); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(dst, dst, jmm_tmp); vdivps(dst, jmm_tmp, dst); @@ -280,19 +286,22 @@ class VActJitCode : public JitCode { // compute tanh with ymm, xmm template - void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT - int mask_idx = 4, int tmp_idx = 5) { + void tanh_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { // y = 2 / (1 + e^(-2x)) - 1 + JMM jmm_src = JMM(src_idx); JMM jmm_tmp = JMM(tmp_idx); JMM jmm_zero = JMM(mask_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); vxorps(jmm_zero, jmm_zero, jmm_zero); vsubps(jmm_tmp, jmm_zero, jmm_tmp); - vmulps(src, src, jmm_tmp); - exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmulps(jmm_src, jmm_src, jmm_tmp); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(dst, dst, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); @@ -304,23 +313,19 @@ class VActJitCode : public JitCode { template void act(JMM& dst, JMM& src, operand_type type) { // NOLINT - // use 15 - JMM zero = JMM(15); - if (type_ == operand_type::relu) { - vxorps(zero, zero, zero); - } + // use 11~15 switch (type) { case operand_type::relu: - relu_jmm(dst, src, zero); + relu_jmm(dst, src, 15); break; case operand_type::exp: - exp_jmm(dst, src, 2, 3, 4, 5); + exp_jmm(dst, src, 11, 12, 13, 14, 15); break; case operand_type::sigmoid: - sigmoid_jmm(dst, src, 2, 3, 4, 5); + sigmoid_jmm(dst, src, 11, 12, 13, 14, 15); break; case operand_type::tanh: - tanh_jmm(dst, src, 2, 3, 4, 5); + tanh_jmm(dst, src, 11, 12, 13, 14, 15); break; case operand_type::identity: break; @@ -414,15 +419,6 @@ class LSTMJitCode : public VActJitCode { operand_type act_cand_; operand_type act_cell_; reg64_t param1{abi_param1}; - xmm_t xmm_src = xmm_t(0); - xmm_t xmm_c = xmm_t(1); - xmm_t xmm_i = xmm_t(6); - xmm_t xmm_f = xmm_t(7); - - ymm_t ymm_src = ymm_t(0); - ymm_t ymm_c = ymm_t(1); // 2~5 for act - ymm_t ymm_i = ymm_t(6); - ymm_t ymm_f = ymm_t(7); }; class GRUJitCode : public VActJitCode { @@ -492,16 +488,6 @@ class GRUJitCode : public VActJitCode { operand_type act_gate_; operand_type act_cand_; reg64_t param1{abi_param1}; - - xmm_t xmm_src = xmm_t(0); - xmm_t xmm_c = xmm_t(1); - xmm_t xmm_i = xmm_t(6); - xmm_t xmm_f = xmm_t(7); - - ymm_t ymm_src = ymm_t(0); - ymm_t ymm_c = ymm_t(1); - ymm_t ymm_i = ymm_t(6); - ymm_t ymm_f = ymm_t(7); }; #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index 2e1a7f22db..bcb6615df8 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -206,7 +206,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { T* ht = reinterpret_cast(step->ht); const T* ht_1 = reinterpret_cast(step->ht_1); auto act_gate = getActFunc(attr->act_gate); - act_gate(gates, gates, attr->d * 2); + act_gate(gates + attr->d, gates + attr->d, attr->d); VMul(ht_1, gates + attr->d, ht, attr->d); } @@ -215,9 +215,11 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { T* gates = reinterpret_cast(step->gates); T* ht = reinterpret_cast(step->ht); const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); auto act_cand = getActFunc(attr->act_cand); int d = attr->d; T* y = gates + d * 2; + act_gate(gates, gates, d); act_cand(y, y, d); // out = zt*ht~ + (1-zt)*ht_1 for (int i = 0; i < d; ++i) { diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index 85ea95cfcc..2db3274a45 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -177,7 +177,7 @@ class GRUKernelImpl : public GRUKernel { explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel() { #ifdef PADDLE_WITH_XBYAK if (useJIT(attr.d)) { - size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; jitcode0_.reset(new gen::GRUJitCode(0, attr, sz > 4096 ? sz : 4096)); this->ComputeH1 = jitcode0_->getCode(); @@ -188,7 +188,7 @@ class GRUKernelImpl : public GRUKernel { jitcode2_.reset(new gen::GRUJitCode(2, attr, sz > 4096 ? sz : 4096)); this->ComputeHtPart2 = - jitcode1_->getCode(); + jitcode2_->getCode(); return; } #endif @@ -207,7 +207,7 @@ class GRUKernelImpl : public GRUKernel { #ifdef PADDLE_WITH_XBYAK template <> bool GRUKernelImpl::useJIT(int d) { - return false; // jitcode not ready yet + return gen::GRUJitCode::init(d); } #endif diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 1cbe1b5d95..cc8a5d4d86 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -714,6 +714,8 @@ TEST(JitKernel, pool) { std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); + // empty call it to avoid unknown flag 'use_pinned_memory' on Mac + paddle::platform::jit::MayIUse(paddle::platform::jit::avx); const auto& plstm1 = jit::KernelPool::Instance() .template Get, const jit::lstm_attr_t&>(attr); From 36f08eef3b466001f339e2c33f47dac60bbc6821 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Fri, 23 Nov 2018 13:04:41 +0800 Subject: [PATCH 063/113] CUDA kernel for density_prior_box_op. (#14513) * CUDA kernel for density_prior_box_op. * Support flatten to 2D. --- paddle/fluid/API.spec | 2 +- paddle/fluid/framework/op_desc.cc | 6 + .../fluid/operators/detection/CMakeLists.txt | 2 +- .../detection/density_prior_box_op.cc | 36 ++-- .../detection/density_prior_box_op.cu | 170 ++++++++++++++++++ .../detection/density_prior_box_op.h | 73 ++++---- python/paddle/fluid/layers/detection.py | 43 +++-- python/paddle/fluid/tests/test_detection.py | 60 ++++--- .../unittests/test_density_prior_box_op.py | 30 ++-- 9 files changed, 305 insertions(+), 117 deletions(-) create mode 100644 paddle/fluid/operators/detection/density_prior_box_op.cu diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 541c4db1fa..50114bf3df 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)) -paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None)) +paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)) paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index fbaa169df6..362cda3f23 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { this->attrs_[name] = std::vector(); break; } + case proto::AttrType::LONGS: { + VLOG(110) << "SetAttr: " << Type() << ", " << name + << " from LONGS to LONGS"; + this->attrs_[name] = std::vector(); + break; + } case proto::AttrType::FLOATS: { VLOG(110) << "SetAttr: " << Type() << ", " << name << " from INTS to FLOATS"; diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 58f6f48467..6c85f1577e 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -22,7 +22,7 @@ iou_similarity_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) -detection_library(density_prior_box_op SRCS density_prior_box_op.cc) +detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(target_assign_op SRCS target_assign_op.cc diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc index 99df15c322..1012ba3652 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -39,24 +39,27 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { auto fixed_sizes = ctx->Attrs().Get>("fixed_sizes"); auto fixed_ratios = ctx->Attrs().Get>("fixed_ratios"); auto densities = ctx->Attrs().Get>("densities"); + bool flatten = ctx->Attrs().Get("flatten_to_2d"); PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(), "The number of fixed_sizes and densities must be equal."); size_t num_priors = 0; - if ((fixed_sizes.size() > 0) && (densities.size() > 0)) { - for (size_t i = 0; i < densities.size(); ++i) { - if (fixed_ratios.size() > 0) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - } + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + if (!flatten) { + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } else { + int64_t dim0 = input_dims[2] * input_dims[3] * num_priors; + ctx->SetOutputDim("Boxes", {dim0, 4}); + ctx->SetOutputDim("Variances", {dim0, 4}); } - std::vector dim_vec(4); - dim_vec[0] = input_dims[2]; - dim_vec[1] = input_dims[3]; - dim_vec[2] = num_priors; - dim_vec[3] = 4; - ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); - ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); } protected: @@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - platform::CPUPlace()); + ctx.GetPlace()); } }; @@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") .SetDefault(true); - + AddAttr("flatten_to_2d", + "(bool) Whether to flatten to 2D and " + "the second dim is 4.") + .SetDefault(false); AddAttr( "step_w", "Density prior boxes step across width, 0.0 for auto calculation.") diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu new file mode 100644 index 0000000000..3b7c781795 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -0,0 +1,170 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" + +namespace paddle { +namespace operators { + +template +static __device__ inline T Clip(T in) { + return min(max(in, 0.), 1.); +} + +template +static __global__ void GenDensityPriorBox( + const int height, const int width, const int im_height, const int im_width, + const T offset, const T step_width, const T step_height, + const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin, + const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) { + int gidx = blockIdx.x * blockDim.x + threadIdx.x; + int gidy = blockIdx.y * blockDim.y + threadIdx.y; + int step_x = blockDim.x * gridDim.x; + int step_y = blockDim.y * gridDim.y; + + const T* width_ratio = ratios_shift; + const T* height_ratio = ratios_shift + num_priors; + const T* width_shift = ratios_shift + 2 * num_priors; + const T* height_shift = ratios_shift + 3 * num_priors; + + for (int j = gidy; j < height; j += step_y) { + for (int i = gidx; i < width * num_priors; i += step_x) { + int h = j; + int w = i / num_priors; + int k = i % num_priors; + + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + + T center_x_temp = center_x + width_shift[k]; + T center_y_temp = center_y + height_shift[k]; + + T box_width_ratio = width_ratio[k] / 2.; + T box_height_ratio = height_ratio[k] / 2.; + + T xmin = max((center_x_temp - box_width_ratio) / im_width, 0.); + T ymin = max((center_y_temp - box_height_ratio) / im_height, 0.); + T xmax = min((center_x_temp + box_width_ratio) / im_width, 1.); + T ymax = min((center_y_temp + box_height_ratio) / im_height, 1.); + + int out_offset = (j * width * num_priors + i) * 4; + out[out_offset] = is_clip ? Clip(xmin) : xmin; + out[out_offset + 1] = is_clip ? Clip(ymin) : ymin; + out[out_offset + 2] = is_clip ? Clip(xmax) : xmax; + out[out_offset + 3] = is_clip ? Clip(ymax) : ymax; + + var[out_offset] = var_xmin; + var[out_offset + 1] = var_ymin; + var[out_offset + 2] = var_xmax; + var[out_offset + 3] = var_ymax; + } + } +} + +template +class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto variances = ctx.Attr>("variances"); + auto is_clip = ctx.Attr("clip"); + + auto fixed_sizes = ctx.Attr>("fixed_sizes"); + auto fixed_ratios = ctx.Attr>("fixed_ratios"); + auto densities = ctx.Attr>("densities"); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = 0; + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + int step_average = static_cast((step_width + step_height) * 0.5); + + framework::Tensor h_temp; + T* tdata = h_temp.mutable_data({num_priors * 4}, platform::CPUPlace()); + int idx = 0; + for (size_t s = 0; s < fixed_sizes.size(); ++s) { + auto fixed_size = fixed_sizes[s]; + int density = densities[s]; + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float ar = fixed_ratios[r]; + int shift = step_average / density; + float box_width_ratio = fixed_size * sqrt(ar); + float box_height_ratio = fixed_size / sqrt(ar); + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = shift / 2. + dj * shift - step_average / 2.; + float center_y_temp = shift / 2. + di * shift - step_average / 2.; + tdata[idx] = box_width_ratio; + tdata[num_priors + idx] = box_height_ratio; + tdata[2 * num_priors + idx] = center_x_temp; + tdata[3 * num_priors + idx] = center_y_temp; + idx++; + } + } + } + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + framework::Tensor d_temp; + framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp); + + // At least use 32 threads, at most 512 threads. + // blockx is multiple of 32. + int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L); + int gridx = (feature_width * num_priors + blockx - 1) / blockx; + dim3 threads(blockx, 1); + dim3 grids(gridx, feature_height); + + auto stream = + ctx.template device_context().stream(); + GenDensityPriorBox<<>>( + feature_height, feature_width, img_height, img_width, offset, + step_width, step_height, num_priors, d_temp.data(), is_clip, + variances[0], variances[1], variances[2], variances[3], + boxes->data(), vars->data()); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(density_prior_box, + ops::DensityPriorBoxOpCUDAKernel, + ops::DensityPriorBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h index 9a52077e9c..ed2f5df80c 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.h +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { step_height = step_h; } int num_priors = 0; - if (fixed_sizes.size() > 0 && densities.size() > 0) { - for (size_t i = 0; i < densities.size(); ++i) { - if (fixed_ratios.size() > 0) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - } + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); } boxes->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); - auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); + auto box_dim = vars->dims(); + boxes->Resize({feature_height, feature_width, num_priors, 4}); + auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); int step_average = static_cast((step_width + step_height) * 0.5); for (int h = 0; h < feature_height; ++h) { @@ -76,36 +74,34 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { auto fixed_size = fixed_sizes[s]; int density = densities[s]; // Generate density prior boxes with fixed ratios. - if (fixed_ratios.size() > 0) { - for (size_t r = 0; r < fixed_ratios.size(); ++r) { - float ar = fixed_ratios[r]; - int shift = step_average / density; - float box_width_ratio = fixed_size * sqrt(ar); - float box_height_ratio = fixed_size / sqrt(ar); - for (int di = 0; di < density; ++di) { - for (int dj = 0; dj < density; ++dj) { - float center_x_temp = - center_x - step_average / 2. + shift / 2. + dj * shift; - float center_y_temp = - center_y - step_average / 2. + shift / 2. + di * shift; - e_boxes(h, w, idx, 0) = - (center_x_temp - box_width_ratio / 2.) / img_width >= 0 - ? (center_x_temp - box_width_ratio / 2.) / img_width - : 0; - e_boxes(h, w, idx, 1) = - (center_y_temp - box_height_ratio / 2.) / img_height >= 0 - ? (center_y_temp - box_height_ratio / 2.) / img_height - : 0; - e_boxes(h, w, idx, 2) = - (center_x_temp + box_width_ratio / 2.) / img_width <= 1 - ? (center_x_temp + box_width_ratio / 2.) / img_width - : 1; - e_boxes(h, w, idx, 3) = - (center_y_temp + box_height_ratio / 2.) / img_height <= 1 - ? (center_y_temp + box_height_ratio / 2.) / img_height - : 1; - idx++; - } + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float ar = fixed_ratios[r]; + int shift = step_average / density; + float box_width_ratio = fixed_size * sqrt(ar); + float box_height_ratio = fixed_size / sqrt(ar); + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = + center_x - step_average / 2. + shift / 2. + dj * shift; + float center_y_temp = + center_y - step_average / 2. + shift / 2. + di * shift; + e_boxes(h, w, idx, 0) = + (center_x_temp - box_width_ratio / 2.) / img_width >= 0 + ? (center_x_temp - box_width_ratio / 2.) / img_width + : 0; + e_boxes(h, w, idx, 1) = + (center_y_temp - box_height_ratio / 2.) / img_height >= 0 + ? (center_y_temp - box_height_ratio / 2.) / img_height + : 0; + e_boxes(h, w, idx, 2) = + (center_x_temp + box_width_ratio / 2.) / img_width <= 1 + ? (center_x_temp + box_width_ratio / 2.) / img_width + : 1; + e_boxes(h, w, idx, 3) = + (center_y_temp + box_height_ratio / 2.) / img_height <= 1 + ? (center_y_temp + box_height_ratio / 2.) / img_height + : 1; + idx++; } } } @@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); vars->Resize(var_dim); + boxes->Resize(box_dim); } }; // namespace operators diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3f17400a14..4843af8340 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -1029,6 +1029,7 @@ def density_prior_box(input, clip=False, steps=[0.0, 0.0], offset=0.5, + flatten_to_2d=False, name=None): """ **Density Prior Box Operator** @@ -1065,22 +1066,24 @@ def density_prior_box(input, height/weight of the input will be automatically calculated. Default: [0., 0.] offset(float): Prior boxes center offset. Default: 0.5 + flatten_to_2d(bool): Whether to flatten output prior boxes and variance + to 2D shape, the second dim is 4. Default: False. name(str): Name of the density prior box op. Default: None. Returns: tuple: A tuple with two Variable (boxes, variances) boxes: the output density prior boxes of PriorBox. - The layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input, - num_priors is the total - box count of each position of input. + The layout is [H, W, num_priors, 4] when flatten_to_2d is False. + The layout is [H * W * num_priors, 4] when flatten_to_2d is True. + H is the height of input, W is the width of input, + num_priors is the total box count of each position of input. variances: the expanded variances of PriorBox. - The layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input - num_priors is the total - box count of each position of input + The layout is [H, W, num_priors, 4] when flatten_to_2d is False. + The layout is [H * W * num_priors, 4] when flatten_to_2d is True. + H is the height of input, W is the width of input + num_priors is the total box count of each position of input. Examples: @@ -1089,14 +1092,11 @@ def density_prior_box(input, box, var = fluid.layers.density_prior_box( input=conv1, image=images, - min_sizes=[100.], - max_sizes=[200.], - aspect_ratios=[1.0, 1.0 / 2.0, 2.0], - densities=[3, 4], - fixed_sizes=[50., 60.], - fixed_ratios=[1.0, 3.0, 1.0 / 3.0], - flip=True, - clip=True) + densities=[4, 2, 1], + fixed_sizes=[32.0, 64.0, 128.0], + fixed_ratios=[1.], + clip=True, + flatten_to_2d=True) """ helper = LayerHelper("density_prior_box", **locals()) dtype = helper.input_dtype() @@ -1127,14 +1127,11 @@ def density_prior_box(input, 'step_w': steps[0], 'step_h': steps[1], 'offset': offset, + 'densities': densities, + 'fixed_sizes': fixed_sizes, + 'fixed_ratios': fixed_ratios, + 'flatten_to_2d': flatten_to_2d, } - if densities is not None and len(densities) > 0: - attrs['densities'] = densities - if fixed_sizes is not None and len(fixed_sizes) > 0: - attrs['fixed_sizes'] = fixed_sizes - if fixed_ratios is not None and len(fixed_ratios) > 0: - attrs['fixed_ratios'] = fixed_ratios - box = helper.create_variable_for_type_inference(dtype) var = helper.create_variable_for_type_inference(dtype) helper.append_op( diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 982d291801..a2eca5541a 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -112,38 +112,42 @@ class TestDetection(unittest.TestCase): class TestPriorBox(unittest.TestCase): def test_prior_box(self): - data_shape = [3, 224, 224] - images = fluid.layers.data( - name='pixel', shape=data_shape, dtype='float32') - conv1 = fluid.layers.conv2d(images, 3, 3, 2) - box, var = layers.prior_box( - input=conv1, - image=images, - min_sizes=[100.0], - aspect_ratios=[1.], - flip=True, - clip=True) - assert len(box.shape) == 4 - assert box.shape == var.shape - assert box.shape[3] == 4 + program = Program() + with program_guard(program): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.prior_box( + input=conv1, + image=images, + min_sizes=[100.0], + aspect_ratios=[1.], + flip=True, + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[3] == 4 class TestDensityPriorBox(unittest.TestCase): def test_density_prior_box(self): - data_shape = [3, 224, 224] - images = fluid.layers.data( - name='pixel', shape=data_shape, dtype='float32') - conv1 = fluid.layers.conv2d(images, 3, 3, 2) - box, var = layers.density_prior_box( - input=conv1, - image=images, - densities=[3, 4], - fixed_sizes=[50., 60.], - fixed_ratios=[1.0], - clip=True) - assert len(box.shape) == 4 - assert box.shape == var.shape - assert box.shape[3] == 4 + program = Program() + with program_guard(program): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.density_prior_box( + input=conv1, + image=images, + densities=[3, 4], + fixed_sizes=[50., 60.], + fixed_ratios=[1.0], + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[-1] == 4 class TestAnchorGenerator(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py index 79d1fd3d71..4b0bc1dcf8 100644 --- a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py @@ -36,7 +36,8 @@ class TestDensityPriorBoxOp(OpTest): 'offset': self.offset, 'densities': self.densities, 'fixed_sizes': self.fixed_sizes, - 'fixed_ratios': self.fixed_ratios + 'fixed_ratios': self.fixed_ratios, + 'flatten_to_2d': self.flatten_to_2d } self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} @@ -48,16 +49,17 @@ class TestDensityPriorBoxOp(OpTest): self.set_data() def set_density(self): - self.densities = [] - self.fixed_sizes = [] - self.fixed_ratios = [] + self.densities = [4, 2, 1] + self.fixed_sizes = [32.0, 64.0, 128.0] + self.fixed_ratios = [1.0] + self.layer_w = 17 + self.layer_h = 17 + self.image_w = 533 + self.image_h = 533 + self.flatten_to_2d = False def init_test_params(self): - self.layer_w = 32 - self.layer_h = 32 - - self.image_w = 40 - self.image_h = 40 + self.set_density() self.step_w = float(self.image_w) / float(self.layer_w) self.step_h = float(self.image_h) / float(self.layer_h) @@ -69,8 +71,6 @@ class TestDensityPriorBoxOp(OpTest): self.variances = [0.1, 0.1, 0.2, 0.2] self.variances = np.array(self.variances, dtype=np.float).flatten() - self.set_density() - self.clip = True self.num_priors = 0 if len(self.fixed_sizes) > 0 and len(self.densities) > 0: @@ -129,6 +129,9 @@ class TestDensityPriorBoxOp(OpTest): (self.layer_h, self.layer_w, self.num_priors, 1)) self.out_boxes = out_boxes.astype('float32') self.out_var = out_var.astype('float32') + if self.flatten_to_2d: + self.out_boxes = self.out_boxes.reshape((-1, 4)) + self.out_var = self.out_var.reshape((-1, 4)) class TestDensityPriorBox(TestDensityPriorBoxOp): @@ -136,6 +139,11 @@ class TestDensityPriorBox(TestDensityPriorBoxOp): self.densities = [3, 4] self.fixed_sizes = [1.0, 2.0] self.fixed_ratios = [1.0] + self.layer_w = 32 + self.layer_h = 32 + self.image_w = 40 + self.image_h = 40 + self.flatten_to_2d = True if __name__ == '__main__': From e950ce7148759472d67856d7c1ba7ec35fe364cd Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 23 Nov 2018 05:54:20 +0000 Subject: [PATCH 064/113] test for mac --- python/paddle/fluid/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index a6416326ed..ac2a7ea47e 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -119,7 +119,8 @@ def __bootstrap__(): 'eager_delete_tensor_gb', 'allocator_strategy', 'reader_queue_speed_test_mode', 'print_sub_graph_dir' ] - if sysstr != 'Darwin': + if 'Darwin' not in sysstr: + print("aaaaa") read_env_flags.append('use_pinned_memory') if os.name != 'nt': From 61c5f13fcf92c18f30c05a90e3d3badd884f9340 Mon Sep 17 00:00:00 2001 From: sabreshao Date: Fri, 23 Nov 2018 14:27:39 +0800 Subject: [PATCH 065/113] Fix cmake for AMDGPU platform (#13801) * HIP cmake. Enable whole archieve build for pybind library. Disable two warning. Rollback to C++11. Link RCCL to WA gpu kernel loading issue. Update eigen to fix build failure. Add more include directories. Fix O3 build failure. Update eigen. fix tensor_util_test segment fault issue add more macro check in hip.cmake. we may consider refine hip.cmake to inherit all add_definitions() in parrent scope, in the future. Fix rocRAND load. Update eigen to fix gru_unit_op and reduce_op. Add HIP support to testing. Update eigen to support int16 and int8 in arg min and arg max. * add rocprim as cub library used by nv implementation * Reduce build time in rocprim. * Add rocprim introduction, remove useless cmake code. * Remove useless flags and format cmake file. --- CMakeLists.txt | 1 + cmake/external/eigen.cmake | 2 +- cmake/external/rocprim.cmake | 44 +++++++++++++++++++++++++++++ cmake/flags.cmake | 3 ++ cmake/generic.cmake | 26 +++++++++-------- cmake/hip.cmake | 32 +++++++++++++++++---- paddle/fluid/pybind/CMakeLists.txt | 4 +-- paddle/testing/paddle_gtest_main.cc | 2 +- 8 files changed, 94 insertions(+), 20 deletions(-) create mode 100644 cmake/external/rocprim.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 3059ab7e0e..8dcf9786e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,6 +204,7 @@ include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) +include(external/rocprim) include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 573ad5e5f0..6aef97f212 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -17,7 +17,7 @@ if(WITH_AMD_GPU) extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 + GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake new file mode 100644 index 0000000000..914c064918 --- /dev/null +++ b/cmake/external/rocprim.cmake @@ -0,0 +1,44 @@ +if (NOT WITH_AMD_GPU) + return() +endif() + +# rocprim is "ROCm Parallel Primitives" for short. +# It is a header-only library providing HIP and HC parallel primitives +# for developing performant GPU-accelerated code on AMD ROCm platform. + +if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "/opt/rocm/hcc") +endif() + +INCLUDE(ExternalProject) + +SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim) +SET(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim) +SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include) + +ExternalProject_Add( + extern_rocprim + GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git" + GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc + PREFIX ${ROCPRIM_SOURCE_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc + CMAKE_ARGS -DONLY_INSTALL=ON + CMAKE_ARGS -DBUILD_TEST=OFF + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR} + + INSTALL_DIR ${ROCPRIM_INSTALL_DIR} + ${EXTERNAL_PROJECT_LOG_ARGS} +) + +INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR}) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";") + add_library(rocprim STATIC ${dummyfile}) +else() + add_library(rocprim INTERFACE) +endif() + +add_dependencies(rocprim extern_rocprim) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 343e44ab4b..c4472040ce 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -129,6 +129,9 @@ set(COMMON_FLAGS -Wno-error=parentheses-equality # Warnings in pybind11 -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 -Wno-error=terminate # Warning in PADDLE_ENFORCE + -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 + -Wimplicit-fallthrough=0 # Warning in tinyformat.h + -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2 ) set(GPU_COMMON_FLAGS diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 111627a932..7d803d00ef 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -454,25 +454,29 @@ function(hip_library TARGET_NAME) else() add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX) - target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a) - find_fluid_modules(${TARGET_NAME}) + target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so) + find_fluid_modules(${TARGET_NAME}) endif() - if (hip_library_DEPS) - add_dependencies(${TARGET_NAME} ${hip_library_DEPS}) - target_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) + if("${hip_library_DEPS}" MATCHES "ARCHIVE_START") + # Support linking flags: --whole-archive (Linux) / -force_load (MacOS). + # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries. + target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) + list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END) + else() + target_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) endif() # cpplint code style foreach(source_file ${hip_library_SRCS}) - string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - endif() + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() endforeach() else(hip_library_SRCS) if (hip_library_DEPS) - merge_static_libs(${TARGET_NAME} ${hip_library_DEPS}) + merge_static_libs(${TARGET_NAME} ${hip_library_DEPS}) else() - message(FATAL "Please specify source file or library in nv_library.") + message(FATAL "Please specify source file or library in nv_library.") endif() endif(hip_library_SRCS) endif() diff --git a/cmake/hip.cmake b/cmake/hip.cmake index bfe491bd6b..4276bc5b08 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU) endif() include_directories("/opt/rocm/include") +include_directories("/opt/rocm/hip/include") +include_directories("/opt/rocm/miopen/include") include_directories("/opt/rocm/hipblas/include") include_directories("/opt/rocm/hiprand/include") include_directories("/opt/rocm/rocrand/include") @@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust") list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc") -set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" ) +set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" ) if(WITH_DSO) set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO") endif(WITH_DSO) -if(WITH_DOUBLE) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE") -endif(WITH_DOUBLE) - if(WITH_TESTING) set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING") endif(WITH_TESTING) +if(WITH_DISTRIBUTE) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE") +endif(WITH_DISTRIBUTE) + +if(WITH_GRPC) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC") +endif(WITH_GRPC) + +if(NOT WITH_GOLANG) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG") +endif(NOT WITH_GOLANG) + +if(WITH_MKLDNN) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN") +endif(WITH_MKLDNN) + +set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") + +if(NOT WITH_RDMA) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA") +endif(NOT WITH_RDMA) + + + if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index fb6ee2f4a5..25d241d976 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -5,8 +5,8 @@ if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} - ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + DEPS ARCHIVE_START ${PYBIND_DEPS} + ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END) else() cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 598f435461..babb862122 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -28,7 +28,7 @@ int main(int argc, char** argv) { for (int i = 0; i < argc; ++i) { new_argv.push_back(argv[i]); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) new_argv.push_back( strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy")); #else From 445fff24dcbdce6c4b98b5631bc6c34831276fca Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 23 Nov 2018 14:40:04 +0800 Subject: [PATCH 066/113] add the bigobj option to NVCC compile fix code style --- cmake/cuda.cmake | 4 ++-- paddle/fluid/operators/beam_search_op_test.cc | 4 ++-- paddle/fluid/platform/stream_callback_manager.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 964d5fd45b..4c7e0fd3f6 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -200,9 +200,9 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") endif() else(NOT WIN32) if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS "-g -G") + list(APPEND CUDA_NVCC_FLAGS "-g -G --compiler-options;/bigobj") elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") + list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG --compiler-options;/bigobj") else() message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") endif() diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index 6e283866ff..40b46781da 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -45,8 +45,8 @@ void CreateInput(LoDTensor* ids, LoDTensor* scores) { auto* ids_data = ids->mutable_data(place); auto* scores_data = scores->mutable_data(place); vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - vector _scores({0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, - 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); + vector _scores( + {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); for (int i = 0; i < 12; i++) { ids_data[i] = _ids[i]; diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 8dcfc4e748..ed8734c98c 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -14,11 +14,11 @@ #pragma once +#include #include #include #include #include -#include #include "paddle/fluid/platform/enforce.h" namespace paddle { From 5cd2fc9fd020444257ec6522504a3b244134439c Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 23 Nov 2018 07:05:48 +0000 Subject: [PATCH 067/113] just for test --- paddle/testing/paddle_gtest_main.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 598f435461..6f10a51d18 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -31,6 +31,11 @@ int main(int argc, char** argv) { #ifdef PADDLE_WITH_CUDA new_argv.push_back( strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy")); +#elif __clang__ + new_argv.push_back( + strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_" + "mb,allocator_strategy")); + new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); #else new_argv.push_back( strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_" From e21edb26f6e7fb364597c31a26f128c3c2710516 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 22 Nov 2018 17:53:13 +0800 Subject: [PATCH 068/113] add Set/GetCPUNumThreads api --- paddle/fluid/inference/api/analysis_config.cc | 1 + paddle/fluid/inference/api/analysis_predictor.cc | 3 +-- paddle/fluid/inference/api/api_impl.cc | 3 +-- paddle/fluid/inference/api/paddle_api.h | 9 +++++++++ .../inference/tests/api/analyzer_resnet50_tester.cc | 1 + paddle/fluid/inference/tests/api/config_printer.h | 2 ++ paddle/fluid/inference/tests/api/tester_helper.h | 1 + paddle/fluid/operators/math/fc_compute.h | 4 +--- paddle/fluid/platform/cpu_helper.cc | 2 +- 9 files changed, 18 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 5ccd2dc5ab..100ee0c9d3 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; + cpu_num_threads_ = other.cpu_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cb14d2a260..9162ccefd8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -35,7 +35,6 @@ #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); -DECLARE_int32(paddle_num_threads); namespace paddle { @@ -67,7 +66,7 @@ bool AnalysisPredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); if (!PrepareScope(parent_scope)) { return false; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index fcbc3803d0..c3d17edea4 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -28,7 +28,6 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" DEFINE_bool(profile, false, "Turn on profiler for fluid"); -DECLARE_int32(paddle_num_threads); namespace paddle { namespace { @@ -76,7 +75,7 @@ bool NativePaddlePredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 0a2a2a1a23..b7f7781d06 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -186,6 +186,15 @@ struct NativeConfig : public PaddlePredictor::Config { // Specify the variable's name of each input if input tensors don't follow the // `feeds` and `fetches` of the phase `save_inference_model`. bool specify_input_name{false}; + + // Set and get the number of cpu threads. + void SetCPUNumThreads(int cpu_num_threads) { + cpu_num_threads_ = cpu_num_threads; + } + int GetCPUNumThreads() const { return cpu_num_threads_; } + + protected: + int cpu_num_threads_{1}; // number of cpu threads for each instance. }; // A factory to help create different predictors. diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 2b936175ed..308a794ca3 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->enable_ir_optim = true; cfg->specify_input_name = true; + cfg->SetCPUNumThreads(FLAGS_paddle_num_threads); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index aa0c4b1d04..a803f5b3f4 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; + os << GenSpaces(num_spaces) + << "cpu_num_threads: " << config.GetCPUNumThreads() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 7b686045a5..fdadd59049 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); DECLARE_bool(profile); +DECLARE_int32(paddle_num_threads); namespace paddle { namespace inference { diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index b072b4c20a..5b9953a5aa 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -17,8 +17,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/jit_kernel.h" -DECLARE_int32(paddle_num_threads); - namespace paddle { namespace operators { namespace math { @@ -43,7 +41,7 @@ inline void FCCompute(const BlasT& blas, const int M, .template Get>(N); #ifdef PADDLE_WITH_MKLML -#pragma omp parallel for if (FLAGS_paddle_num_threads > 1) +#pragma omp parallel for #endif for (int i = 0; i < M; i++) { T* dst = Y + i * N; diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index f2d691b293..b737a6c38d 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) { #elif defined(PADDLE_WITH_MKLML) int real_num_threads = num_threads > 1 ? num_threads : 1; platform::dynload::MKL_Set_Num_Threads(real_num_threads); - omp_set_num_threads(num_threads); + omp_set_num_threads(real_num_threads); #else PADDLE_ENFORCE(false, "To be implemented."); #endif From a5c4b463c962bed48fba89d459adf82f4899d6c3 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 22 Nov 2018 18:37:33 +0800 Subject: [PATCH 069/113] add SetMKLDNNThreadId api --- paddle/fluid/inference/api/analysis_predictor.cc | 8 ++++++++ paddle/fluid/inference/api/analysis_predictor.h | 2 ++ paddle/fluid/inference/api/paddle_analysis_config.h | 2 +- paddle/fluid/inference/tests/api/tester_helper.h | 9 ++++++--- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 9162ccefd8..4633a75e5e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -159,6 +159,14 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } +void AnalysisPredictor::SetMKLDNNThreadId(int tid) { +#ifdef PADDLE_WITH_MKLDNN + platform::set_cur_thread_id(tid); +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; +#endif +} + bool AnalysisPredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index cf81b7db73..9191970a3a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor { framework::Scope *scope() { return scope_.get(); } framework::ProgramDesc &program() { return *inference_program_; } + void SetMKLDNNThreadId(int tid); + protected: bool PrepareProgram(const std::shared_ptr &program); bool PrepareScope(const std::shared_ptr &parent_scope); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 2ac736df7c..a09bd1cac2 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig { int max_batch_size = 1); bool use_tensorrt() const { return use_tensorrt_; } + void EnableMKLDNN(); // NOTE this is just for internal development, please not use it. // NOT stable yet. - void EnableMKLDNN(); bool use_mkldnn() const { return use_mkldnn_; } friend class ::paddle::AnalysisPredictor; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index fdadd59049..72703bc80b 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -216,13 +216,16 @@ void TestMultiThreadPrediction( size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { threads.emplace_back([&, tid]() { -#ifdef PADDLE_WITH_MKLDNN - platform::set_cur_thread_id(static_cast(tid) + 1); -#endif // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. std::vector outputs_tid; auto &predictor = predictors[tid]; +#ifdef PADDLE_WITH_MKLDNN + if (use_analysis) { + static_cast(predictor.get()) + ->SetMKLDNNThreadId(static_cast(tid) + 1); + } +#endif // warmup run LOG(INFO) << "Running thread " << tid << ", warm up run..."; From e66b4c6bff74231898cbbb013627b0eb86eced0f Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 22 Nov 2018 18:49:59 +0800 Subject: [PATCH 070/113] adjust tester_helper to make multi-instance multi-thread work test=develop --- paddle/fluid/inference/tests/api/tester_helper.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 72703bc80b..d21567ac19 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -207,11 +207,7 @@ void TestMultiThreadPrediction( int batch_size = FLAGS_batch_size; int num_times = FLAGS_repeat; std::vector threads; - std::vector> predictors; - predictors.emplace_back(CreateTestPredictor(config, use_analysis)); - for (int tid = 1; tid < num_threads; ++tid) { - predictors.emplace_back(predictors.front()->Clone()); - } + auto main_predictor = CreateTestPredictor(config, use_analysis); size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { @@ -219,7 +215,9 @@ void TestMultiThreadPrediction( // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. std::vector outputs_tid; - auto &predictor = predictors[tid]; + // To ensure the thread binding correctly, + // please clone inside the threadpool. + auto predictor = main_predictor->Clone(); #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { static_cast(predictor.get()) From 116979a40adf7fe7788f8cd50b9f03c57bcbba7b Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 23 Nov 2018 16:17:56 +0800 Subject: [PATCH 071/113] refine api name test=develop --- paddle/fluid/inference/api/analysis_config.cc | 3 ++- paddle/fluid/inference/api/analysis_predictor.cc | 4 ++-- paddle/fluid/inference/api/analysis_predictor.h | 2 +- paddle/fluid/inference/api/api_impl.cc | 2 +- paddle/fluid/inference/api/paddle_api.h | 14 +++++++++----- .../tests/api/analyzer_resnet50_tester.cc | 2 +- paddle/fluid/inference/tests/api/config_printer.h | 2 +- paddle/fluid/inference/tests/api/tester_helper.h | 2 +- 8 files changed, 18 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 100ee0c9d3..dd75f0d9a6 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -46,7 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; - cpu_num_threads_ = other.cpu_num_threads_; + cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; @@ -73,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; + cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4633a75e5e..c132ce326c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -66,7 +66,7 @@ bool AnalysisPredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (!PrepareScope(parent_scope)) { return false; @@ -159,7 +159,7 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } -void AnalysisPredictor::SetMKLDNNThreadId(int tid) { +void AnalysisPredictor::SetMkldnnThreadID(int tid) { #ifdef PADDLE_WITH_MKLDNN platform::set_cur_thread_id(tid); #else diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 9191970a3a..db57812bc3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -69,7 +69,7 @@ class AnalysisPredictor : public PaddlePredictor { framework::Scope *scope() { return scope_.get(); } framework::ProgramDesc &program() { return *inference_program_; } - void SetMKLDNNThreadId(int tid); + void SetMkldnnThreadID(int tid); protected: bool PrepareProgram(const std::shared_ptr &program); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index c3d17edea4..66a8e51396 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -75,7 +75,7 @@ bool NativePaddlePredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index b7f7781d06..1513a4b3b4 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -187,14 +187,18 @@ struct NativeConfig : public PaddlePredictor::Config { // `feeds` and `fetches` of the phase `save_inference_model`. bool specify_input_name{false}; - // Set and get the number of cpu threads. - void SetCPUNumThreads(int cpu_num_threads) { - cpu_num_threads_ = cpu_num_threads; + // Set and get the number of cpu math library threads. + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; + } + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; } - int GetCPUNumThreads() const { return cpu_num_threads_; } protected: - int cpu_num_threads_{1}; // number of cpu threads for each instance. + // number of cpu math library (such as MKL, OpenBlas) threads for each + // instance. + int cpu_math_library_num_threads_{1}; }; // A factory to help create different predictors. diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 308a794ca3..abc63577b7 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -27,7 +27,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->enable_ir_optim = true; cfg->specify_input_name = true; - cfg->SetCPUNumThreads(FLAGS_paddle_num_threads); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index a803f5b3f4..4231eef722 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -54,7 +54,7 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; os << GenSpaces(num_spaces) - << "cpu_num_threads: " << config.GetCPUNumThreads() << "\n"; + << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index d21567ac19..1dc1678406 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -221,7 +221,7 @@ void TestMultiThreadPrediction( #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { static_cast(predictor.get()) - ->SetMKLDNNThreadId(static_cast(tid) + 1); + ->SetMkldnnThreadID(static_cast(tid) + 1); } #endif From 47c4e65d608083cb4b75222bcd14c1db8bc40333 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 23 Nov 2018 08:34:31 +0000 Subject: [PATCH 072/113] test=develop --- paddle/fluid/memory/allocation/retry_allocator_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index a0ce2875cb..f0b215dac2 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -41,7 +41,7 @@ TEST(RetryAllocator, RetryAllocator) { size_t thread_num = 32; size_t sleep_time = 40; - size_t extra_time = 2; + size_t extra_time = 10; // Reserve to perform more tests in the future std::vector> allocators; From c35bf3d34b43dd6cc5b96e963f8990d60a68d749 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 16:36:54 +0800 Subject: [PATCH 073/113] Fix multiclass_nms_op unit test fail in python3.6 test=develop --- .../fluid/tests/unittests/test_multiclass_nms_op.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index df0562dcc7..e35be54b63 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -145,10 +145,16 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, lod.append(nmsed_num) if nmsed_num == 0: continue + tmp_det_out = [] for c, indices in nmsed_outs.items(): for idx in indices: xmin, ymin, xmax, ymax = boxes[n][idx][:] det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) + tmp_det_out.append( + [c, scores[n][c][idx], xmin, ymin, xmax, ymax]) + sorted_det_out = sorted( + tmp_det_out, key=lambda tup: tup[0], reverse=False) + det_outs.extend(sorted_det_out) return det_outs, lod @@ -210,7 +216,7 @@ class TestMulticlassNMSOp(OpTest): class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp): def set_argument(self): # Here set 2.0 to test the case there is no outputs. - # In practical use, 0.0 < score_threshold < 1.0 + # In practical use, 0.0 < score_threshold < 1.0 self.score_threshold = 2.0 From 5ca56cad1f3fdb50f7d019ae5e658b538f98aecc Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 23 Nov 2018 08:54:09 +0000 Subject: [PATCH 074/113] test=develop --- python/paddle/fluid/layers/control_flow.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 9730fbf510..05138bf945 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -896,9 +896,10 @@ def array_to_lod_tensor(x, table): def increment(x, value=1.0, in_place=True): """ - This function performs an operation that increments each value in the + This function performs an operation that increments the value in the input :math:`x` by an amount: :math:`value` as mentioned in the input - parameter. This operation is performed in-place by default. + parameter. This operation is performed in-place by default. Notice that + the number of elements in :math:`x` must be equal to 1. Args: x (Variable|list): The tensor that has the input values. @@ -911,7 +912,8 @@ def increment(x, value=1.0, in_place=True): Examples: .. code-block:: python - data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32') + data = fluid.layers.data(name='data', shape=[1], dtype='float32', + append_batch_size=False) data = fluid.layers.increment(x=data, value=3.0, in_place=True) """ helper = LayerHelper("increment", **locals()) From f7847ca6a304a649982c04bff4f3eec846a06c5d Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 23 Nov 2018 17:09:14 +0800 Subject: [PATCH 075/113] fix cublas warp error test=develop --- paddle/fluid/platform/dynload/cublas.cc | 3 +++ paddle/fluid/platform/dynload/cublas.h | 30 ++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc index 361d3439b8..41648c32fe 100644 --- a/paddle/fluid/platform/dynload/cublas.cc +++ b/paddle/fluid/platform/dynload/cublas.cc @@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); #endif +#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4 +CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); +#endif } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index ff80bd525c..ced789b90d 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -90,23 +90,33 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // APIs available after CUDA 8.0 #if CUDA_VERSION >= 8000 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched); +#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ + __macro(cublasGemmEx); \ + __macro(cublasSgemmStridedBatched); \ + __macro(cublasDgemmStridedBatched); \ + __macro(cublasCgemmStridedBatched); \ + __macro(cublasZgemmStridedBatched); \ + __macro(cublasHgemmStridedBatched); + +CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif // APIs available after CUDA 9.0 #if CUDA_VERSION >= 9000 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode); +#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ + __macro(cublasSetMathMode); \ + __macro(cublasGetMathMode); + +CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif +// APIs available after CUDA 9.1 #if CUDA_VERSION >= 9010 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx); +#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ + __macro(cublasGemmBatchedEx); \ + __macro(cublasGemmStridedBatchedEx); + +CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP From 5431d5c471d32a5ea3be049a339e57262bd3b483 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 18:15:06 +0800 Subject: [PATCH 076/113] Polish code test=develop --- python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index e35be54b63..9778bd694d 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -149,7 +149,6 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, for c, indices in nmsed_outs.items(): for idx in indices: xmin, ymin, xmax, ymax = boxes[n][idx][:] - det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) tmp_det_out.append( [c, scores[n][c][idx], xmin, ymin, xmax, ymax]) sorted_det_out = sorted( From 3d100b0c927b6326e75b3e493d545ee2b0ff4f4b Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 19:14:46 +0800 Subject: [PATCH 077/113] Add Python3.6 Python3.7 compile process test=develop --- Dockerfile | 75 +++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/Dockerfile b/Dockerfile index b36102175c..eb7bb2549e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,44 +33,49 @@ RUN apt-get update && \ automake locales clang-format swig cmake \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ + build-essential checkinstall \ + libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools libtool ccache && \ apt-get clean -y -# Install Go and glide -RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ - tar -xz -C /usr/local && \ - mkdir /root/gopath && \ - mkdir /root/gopath/bin && \ - mkdir /root/gopath/src -ENV GOROOT=/usr/local/go GOPATH=/root/gopath -# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. -ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin -# install glide -RUN curl -s -q https://glide.sh/get | sh - -# Install TensorRT -# following TensorRT.tar.gz is not the default official one, we do two miny changes: -# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, -# and its size is only one-third of the official one. -# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. -# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - tar -xz -C /usr/local && \ - cp -rf /usr/local/TensorRT/include /usr && \ - cp -rf /usr/local/TensorRT/lib /usr - -# git credential to skip password typing -RUN git config --global credential.helper store - -# Fix locales to en_US.UTF-8 -RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter -# version util jupyter fixes this issue. - -# specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# version(1.7.1 for now), which causes building documentation failed. +COPY tools/manylinux1/build_scripts/* /root/python/ +RUN cd /root/python/ && source build_utils && MY_DIR=/root/python/ build_cpythons 3.6.0 3.7.0 + +# # Install Go and glide +# RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + # tar -xz -C /usr/local && \ + # mkdir /root/gopath && \ + # mkdir /root/gopath/bin && \ + # mkdir /root/gopath/src +# ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +# ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# # install glide +# RUN curl -s -q https://glide.sh/get | sh + +# # Install TensorRT +# # following TensorRT.tar.gz is not the default official one, we do two miny changes: +# # 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# # and its size is only one-third of the official one. +# # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. +# RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ + # tar -xz -C /usr/local && \ + # cp -rf /usr/local/TensorRT/include /usr && \ + # cp -rf /usr/local/TensorRT/lib /usr + +# # git credential to skip password typing +# RUN git config --global credential.helper store + +# # Fix locales to en_US.UTF-8 +# RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter +# # version util jupyter fixes this issue. + +# # specify sphinx version as 1.5.6 and remove -U option for [pip install -U +# # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest +# # version(1.7.1 for now), which causes building documentation failed. # RUN pip3 install -U wheel && \ # pip3 install -U docopt PyYAML sphinx==1.5.6 && \ # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ From 64ca3d176cc1348f0735e3e6f4fd2c18e902f43b Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Fri, 23 Nov 2018 19:18:20 +0800 Subject: [PATCH 078/113] Add bias_attr in sequence_conv_pool API. (#14553) --- paddle/fluid/API.spec | 2 +- python/paddle/fluid/nets.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 50114bf3df..8397ae093b 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -342,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)) -paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) +paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 00d33b36fc..fb75ef62d0 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -250,7 +250,8 @@ def sequence_conv_pool(input, filter_size, param_attr=None, act="sigmoid", - pool_type="max"): + pool_type="max", + bias_attr=None): """ The sequence_conv_pool is composed with Sequence Convolution and Pooling. @@ -266,6 +267,11 @@ def sequence_conv_pool(input, pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling. Default :math:`max`. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, sequence_conv + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. Return: Variable: The final result after Sequence Convolution and Pooling. @@ -289,6 +295,7 @@ def sequence_conv_pool(input, num_filters=num_filters, filter_size=filter_size, param_attr=param_attr, + bias_attr=bias_attr, act=act) pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type) From e8a8f2626cc65b8dc3a91507eb233581e8e7e0e2 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 16:58:57 +0800 Subject: [PATCH 079/113] Add Python3.6 and Python3.7 support in Ubuntu Dockerfile test=develop --- Dockerfile | 200 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 118 insertions(+), 82 deletions(-) diff --git a/Dockerfile b/Dockerfile index eb7bb2549e..6f45c79f3a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,6 +22,27 @@ ENV HOME /root # Add bash enhancements COPY ./paddle/scripts/docker/root/ /root/ +# Prepare packages for Python +RUN apt-get update && \ + apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ + libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ + xz-utils tk-dev libffi-dev liblzma-dev + +# Install Python3.6 +RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ + tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ + ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ + wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ + tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.6 --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + +# Install Python3.7 +RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ + tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7 --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + RUN apt-get update && \ apt-get install -y --allow-downgrades patchelf \ python3 python3-dev python3-pip \ @@ -34,88 +55,103 @@ RUN apt-get update && \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ build-essential checkinstall \ - libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ + libreadline-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools libtool ccache && \ apt-get clean -y -COPY tools/manylinux1/build_scripts/* /root/python/ -RUN cd /root/python/ && source build_utils && MY_DIR=/root/python/ build_cpythons 3.6.0 3.7.0 - -# # Install Go and glide -# RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ - # tar -xz -C /usr/local && \ - # mkdir /root/gopath && \ - # mkdir /root/gopath/bin && \ - # mkdir /root/gopath/src -# ENV GOROOT=/usr/local/go GOPATH=/root/gopath -# # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. -# ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin -# # install glide -# RUN curl -s -q https://glide.sh/get | sh - -# # Install TensorRT -# # following TensorRT.tar.gz is not the default official one, we do two miny changes: -# # 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, -# # and its size is only one-third of the official one. -# # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. -# # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -# RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - # tar -xz -C /usr/local && \ - # cp -rf /usr/local/TensorRT/include /usr && \ - # cp -rf /usr/local/TensorRT/lib /usr - -# # git credential to skip password typing -# RUN git config --global credential.helper store - -# # Fix locales to en_US.UTF-8 -# RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -# # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter -# # version util jupyter fixes this issue. - -# # specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# # version(1.7.1 for now), which causes building documentation failed. -# RUN pip3 install -U wheel && \ - # pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - # easy_install -U pip && \ - # pip install -U pip setuptools wheel && \ - # pip install -U docopt PyYAML sphinx==1.5.6 && \ - # pip install sphinx-rtd-theme==0.1.9 recommonmark - -# RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - # pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - # pip3 install opencv-python && \ - # pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - # pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - # pip install opencv-python - -# #For docstring checker -# RUN pip3 install pylint pytest astroid isort -# RUN pip install pylint pytest astroid isort LinkChecker - -# COPY ./python/requirements.txt /root/ -# RUN pip3 install -r /root/requirements.txt -# RUN pip install -r /root/requirements.txt - -# # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use -# # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -# RUN apt-get install -y libssl-dev libffi-dev -# RUN pip3 install certifi urllib3[secure] -# RUN pip install certifi urllib3[secure] - - -# # Install woboq_codebrowser to /woboq -# RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - # (cd /woboq \ - # cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - # -DCMAKE_BUILD_TYPE=Release . \ - # make) - -# # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service -# RUN mkdir /var/run/sshd -# RUN echo 'root:root' | chpasswd -# RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config -# RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config -# EXPOSE 22 +# Install Go and glide +RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + +# Install TensorRT +# following TensorRT.tar.gz is not the default official one, we do two miny changes: +# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# and its size is only one-third of the official one. +# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. +RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ + tar -xz -C /usr/local && \ + cp -rf /usr/local/TensorRT/include /usr && \ + cp -rf /usr/local/TensorRT/lib /usr + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter +# version util jupyter fixes this issue. + +# specify sphinx version as 1.5.6 and remove -U option for [pip install -U +# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest +# version(1.7.1 for now), which causes building documentation failed. +RUN pip3.5 install -U wheel && \ + pip3.5 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.5 install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 install -U wheel && \ + pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 install -U wheel && \ + pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ + easy_install -U pip && \ + pip install -U pip setuptools wheel && \ + pip install -U docopt PyYAML sphinx==1.5.6 && \ + pip install sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.5 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.5 install opencv-python && \ + pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 install opencv-python && \ + pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 install opencv-python && \ + pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip install opencv-python + +#For docstring checker +RUN pip3.5 install pylint pytest astroid isort +RUN pip3.6 install pylint pytest astroid isort +RUN pip3.7 install pylint pytest astroid isort +RUN pip install pylint pytest astroid isort LinkChecker + +COPY ./python/requirements.txt /root/ +RUN pip3.5 install -r /root/requirements.txt +RUN pip3.6 install -r /root/requirements.txt +RUN pip3.7 install -r /root/requirements.txt +RUN pip install -r /root/requirements.txt + +# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use +# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 +RUN apt-get install -y libssl-dev libffi-dev +RUN pip3.5 install certifi urllib3[secure] +RUN pip3.6 install certifi urllib3[secure] +RUN pip3.7 install certifi urllib3[secure] +RUN pip install certifi urllib3[secure] + + +# Install woboq_codebrowser to /woboq +RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ + (cd /woboq \ + cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ + -DCMAKE_BUILD_TYPE=Release . \ + make) + +# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service +RUN mkdir /var/run/sshd +RUN echo 'root:root' | chpasswd +RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config +RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +EXPOSE 22 From 155a0f78e6f8688a68dae765dee4d25e08bc614b Mon Sep 17 00:00:00 2001 From: Min Date: Sat, 24 Nov 2018 17:17:14 +0800 Subject: [PATCH 080/113] Polish code test=develop --- Dockerfile | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6f45c79f3a..9459552890 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,13 +34,13 @@ RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-a ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.6 --enable-shared > /dev/null && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null # Install Python3.7 RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7 --enable-shared > /dev/null && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null RUN apt-get update && \ @@ -54,8 +54,6 @@ RUN apt-get update && \ automake locales clang-format swig cmake \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ - build-essential checkinstall \ - libreadline-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools libtool ccache && \ apt-get clean -y @@ -94,9 +92,9 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip3.5 install -U wheel && \ - pip3.5 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.5 install sphinx-rtd-theme==0.1.9 recommonmark && \ +RUN pip3 install -U wheel && \ + pip3 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.6 install -U wheel && \ pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ @@ -108,9 +106,9 @@ RUN pip3.5 install -U wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install sphinx-rtd-theme==0.1.9 recommonmark -RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.5 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.5 install opencv-python && \ +RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 install opencv-python && \ pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3.6 install opencv-python && \ @@ -122,13 +120,13 @@ RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip install opencv-python #For docstring checker -RUN pip3.5 install pylint pytest astroid isort +RUN pip3 install pylint pytest astroid isort RUN pip3.6 install pylint pytest astroid isort RUN pip3.7 install pylint pytest astroid isort RUN pip install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ -RUN pip3.5 install -r /root/requirements.txt +RUN pip3 install -r /root/requirements.txt RUN pip3.6 install -r /root/requirements.txt RUN pip3.7 install -r /root/requirements.txt RUN pip install -r /root/requirements.txt @@ -136,7 +134,7 @@ RUN pip install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 RUN apt-get install -y libssl-dev libffi-dev -RUN pip3.5 install certifi urllib3[secure] +RUN pip3 install certifi urllib3[secure] RUN pip3.6 install certifi urllib3[secure] RUN pip3.7 install certifi urllib3[secure] RUN pip install certifi urllib3[secure] From 05e6a7141717f1eb1e73836c7aec67faea4d4db5 Mon Sep 17 00:00:00 2001 From: Min Date: Sat, 24 Nov 2018 17:19:22 +0800 Subject: [PATCH 081/113] Polish code test=develop --- .dockerignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 49adfe4f0a..2b2e74053d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,5 @@ *.DS_Store build/ -build* *.user .vscode .idea From 8038cd10a93c66405bc7221f3d6cf1605c25df0d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 19:19:44 +0800 Subject: [PATCH 082/113] Upgrade pybind11 to v2.2.4 to support Python3.7 test=develop --- cmake/external/pybind11.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index c885877a2b..3a10ea945d 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -26,7 +26,7 @@ ExternalProject_Add( extern_pybind ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/pybind/pybind11.git" - GIT_TAG "v2.1.1" + GIT_TAG "v2.2.4" PREFIX ${PYBIND_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" From bc4923321ce286f357792c5a28884a665fcb87b7 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Sat, 24 Nov 2018 19:32:46 +0800 Subject: [PATCH 083/113] Update __init__.py --- python/paddle/fluid/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index ac2a7ea47e..ff651db043 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -120,7 +120,6 @@ def __bootstrap__(): 'reader_queue_speed_test_mode', 'print_sub_graph_dir' ] if 'Darwin' not in sysstr: - print("aaaaa") read_env_flags.append('use_pinned_memory') if os.name != 'nt': From 81994e84e055cba8b4d3fe0b1ecb94b12d731661 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 19:37:37 +0800 Subject: [PATCH 084/113] Change the include files because the version changes of pybind11 test=develop --- paddle/fluid/pybind/tensor_py.h | 1 - paddle/scripts/paddle_build.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index b39323f843..02a75236f6 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" -#include "pybind11/common.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9632eaec00..86925b26e7 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -149,7 +149,7 @@ function cmake_gen() { elif [ "$1" == "cp37-cp37m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH} - export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3 + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3.7 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so" fi From b67229187e67b04f6f6517cf8c0ceb7fcd8629f4 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 19:47:50 +0800 Subject: [PATCH 085/113] Change to PYBIND11_MODULE because the deprecation of PYBIND11_PLUGIN test=develop --- paddle/fluid/pybind/pybind.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 795800fd51..bf86b83d4e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -86,7 +86,7 @@ bool IsCompiledWithDIST() { #endif } -PYBIND11_PLUGIN(core) { +PYBIND11_MODULE(core) { // Not used, just make sure cpu_info.cc is linked. paddle::platform::CpuTotalPhysicalMemory(); From d2045260a5cd907238e594483daf0d2fbfa51314 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 20:07:53 +0800 Subject: [PATCH 086/113] Change visibilities of variant_visitor of pybind11 test=develop --- paddle/fluid/pybind/protobuf.cc | 13 +++++++------ paddle/fluid/pybind/pybind.cc | 5 ++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 586e92c2b3..0443ff3fc3 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -30,11 +30,12 @@ namespace pybind11 { namespace detail { // Can be replaced by a generic lambda in C++14 -struct variant_caster_visitor : public boost::static_visitor { +struct __attribute__((visibility("hidden"))) paddle_variant_caster_visitor + : public boost::static_visitor { return_value_policy policy; handle parent; - variant_caster_visitor(return_value_policy policy, handle parent) + paddle_variant_caster_visitor(return_value_policy policy, handle parent) : policy(policy), parent(parent) {} template @@ -44,10 +45,10 @@ struct variant_caster_visitor : public boost::static_visitor { }; template -struct variant_caster; +struct paddle_variant_caster; template