test=develop

local_add_cudnn_lstm
JiabinYang 6 years ago
commit 3c6102a367

@@ -25,6 +25,7 @@
| kexinzhao | Ke-Xin Zhao |
| kuke | Yi-Bing Liu |
| lcy-seso | Ying Cao |
| cjld | Dun Liang |
| lipeng-unisound | Peng Li |
| liuyuan | Yuan Liu |
| livc | Zhao Li |

@@ -130,6 +130,21 @@ if (APPLE OR WIN32)
"Disable MKL for building on mac and windows" FORCE)
endif()
if (WIN32)
set(WITH_AVX OFF CACHE STRING
"Disable AVX when compiling for Windows" FORCE)
set(WITH_DSO OFF CACHE STRING
"Disable DSO when compiling for Windows" FORCE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL when compiling for Windows" FORCE)
set(WITH_DISTRIBUTE OFF CACHE STRING
"Disable DISTRIBUTE when compiling for Windows" FORCE)
set(WITH_C_API OFF CACHE STRING
"Disable C_API when compiling for Windows" FORCE)
set(WITH_FLUID_ONLY ON CACHE STRING
"Enable FLUID_ONLY when compiling for Windows" FORCE)
endif()
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
@@ -190,11 +205,11 @@ include(external/pybind11) # download pybind11
include(external/cares)
include(external/cub)
include(external/xxhash) # download xxhash
if (NOT WIN32)
# there is no official support for snappystream, warpctc, nccl, or cupti on Windows
include(external/snappy) # download snappy
include(external/snappystream) # download snappystream
if (NOT WIN32)
# there is no official support for warpctc, nccl, or cupti on Windows
include(external/warpctc) # download, build, install warpctc
include(cupti)
endif (NOT WIN32)

@@ -50,7 +50,11 @@ IF(WITH_TESTING)
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_GMOCK=ON

@@ -24,7 +24,11 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
if (WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
else(WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
endif (WIN32)
ExternalProject_Add(
extern_snappy
@@ -34,8 +38,12 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
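Note: as a quick sanity check of the snappy library this ExternalProject builds, the round trip below can be compiled against SNAPPY_INCLUDE_DIR and linked with the static library configured above; it is an illustrative sketch, not part of the commit, and uses only snappy's stable public API.
#include <snappy.h>
#include <cassert>
#include <string>
int main() {
  // Compress a small buffer and verify it decompresses to the original.
  std::string input(1024, 'x'), compressed, restored;
  snappy::Compress(input.data(), input.size(), &compressed);
  assert(snappy::Uncompress(compressed.data(), compressed.size(), &restored));
  assert(restored == input);
  return 0;
}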

@@ -18,36 +18,45 @@ ENDIF()
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
ExternalProject_Add(
extern_snappystream
GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
GIT_TAG "0.2.8"
PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
DEPENDS snappy
)
if(WIN32)
# FIXME: VS2015 comes without VLA support
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib")
MESSAGE(WARNING "snappystream has no compile support on Windows;
please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR})
else(WIN32)
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
ExternalProject_Add(
extern_snappystream
GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
GIT_TAG "0.2.8"
PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
DEPENDS snappy
)
endif(WIN32)
add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})

@@ -351,6 +351,9 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
if(WIN32)
target_link_libraries(${TARGET_NAME} shlwapi)
endif(WIN32)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
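Note: a target declared through cc_test() is an ordinary gtest binary wired into CTest; a minimal standalone equivalent, using gtest's stock main() instead of paddle_gtest_main (an assumption made only for this illustration), looks like this.
#include <gtest/gtest.h>
TEST(CcTestExample, Sanity) { EXPECT_EQ(2 + 2, 4); }
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();  // non-zero exit marks the CTest entry as failed
}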

@@ -84,9 +84,7 @@ function(op_library TARGET)
endif()
if (WIN32)
# remove ops unsupported on Windows, because Windows has no nccl, warpctc, or similar dependencies
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return()
endif()

@@ -57,43 +57,46 @@ int main()
return 0;
}" SSE3_FOUND)
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}" AVX_FOUND)
# disable AVX by default on Windows
if(NOT WIN32)
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}" AVX_FOUND)
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" AVX2_FOUND)
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" AVX2_FOUND)
# Check AVX512F
set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
13, -5, 6, -7, 9, 2, -6, 3);
__m512i result = _mm512_abs_epi32 (a);
return 0;
}" AVX512F_FOUND)
# Check AVX512F
set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
13, -5, 6, -7, 9, 2, -6, 3);
__m512i result = _mm512_abs_epi32 (a);
return 0;
}" AVX512F_FOUND)
endif(NOT WIN32)
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)

@@ -103,6 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False))
paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)

@@ -3,13 +3,9 @@ add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(string)
add_subdirectory(pybind)
if (NOT WIN32)
add_subdirectory(recordio)
endif(NOT WIN32)
add_subdirectory(pybind)
# NOTE: please add subdirectory inference at last.
add_subdirectory(inference)
add_subdirectory(train)

@@ -31,9 +31,7 @@ function(windows_symbolic TARGET)
endfunction()
add_subdirectory(ir)
if (NOT WIN32)
add_subdirectory(details)
endif (NOT WIN32)
# ddim lib
proto_library(framework_proto SRCS framework.proto)
@@ -68,11 +66,7 @@ if(WITH_GPU)
else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif()
if (NOT WIN32)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
else()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
endif (NOT WIN32)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
@@ -122,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
if (NOT WIN32)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
else()
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor)
endif(NOT WIN32)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
@@ -183,12 +172,10 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph build_strategy
fast_threaded_ssa_graph_executor)
endif() # NOT WIN32
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

@@ -13,9 +13,9 @@
// limitations under the License.
#pragma once
#include <ThreadPool.h>
#include <string>
#include <vector>
#include "ThreadPool.h"
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"

@@ -13,11 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// logging.h and windows.h conflict
#define GLOG_NO_ABBREVIATED_SEVERITIES
// solve static linking error in windows
// https://github.com/google/glog/issues/301
#define GOOGLE_GLOG_DLL_DECL
#include "paddle/fluid/framework/tensor.h"
#include "unsupported/Eigen/CXX11/Tensor"

@@ -23,11 +23,6 @@ limitations under the License. */
#include <unordered_map>
#include <unordered_set>
#if defined(_WIN32)
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#define GOOGLE_GLOG_DLL_DECL
#endif
#include "glog/logging.h" // For VLOG()
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/details/op_registry.h"

@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include <gflags/gflags.h>
#include <glog/logging.h>

@@ -20,8 +20,6 @@ limitations under the License. */
#include <tuple>
#include <unordered_map>
#include <vector>
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include "glog/logging.h" // For VLOG
#include "paddle/fluid/framework/attribute.h"

@@ -14,12 +14,6 @@ limitations under the License. */
#pragma once
// logging.h and windows.h conflict
#define GLOG_NO_ABBREVIATED_SEVERITIES
// solve static linking error in windows
// https://github.com/google/glog/issues/301
#define GOOGLE_GLOG_DLL_DECL
#include <glog/logging.h>
#include <map>
#include <memory>

@@ -15,6 +15,12 @@
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
#ifdef _WIN32
#define posix_memalign_free _aligned_free
#define posix_memalign(p, a, s) \
(((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#endif
namespace paddle {
namespace memory {
namespace allocation {
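Note: the shim above lets allocator code call posix_memalign uniformly: on Windows it expands to _aligned_malloc and reports failure through errno, matching POSIX's return-0-on-success contract. A standalone sketch of the shim in use (the non-Windows posix_memalign_free fallback is an added assumption, included for symmetry):
#include <cerrno>
#include <cstdlib>
#ifdef _WIN32
#include <malloc.h>
#define posix_memalign_free _aligned_free
#define posix_memalign(p, a, s) \
  (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#else
#define posix_memalign_free free  // POSIX aligned blocks are free()d normally
#endif
int main() {
  void* ptr = nullptr;
  // Request 1 KiB on a 64-byte boundary; 0 means success on both paths.
  if (posix_memalign(&ptr, 64, 1024) != 0) return 1;
  posix_memalign_free(ptr);
  return 0;
}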

@@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE)
add_subdirectory(distributed_ops)
endif()
if (NOT WIN32)
add_subdirectory(reader)
endif()
add_subdirectory(reader)
if (NOT WIN32)
add_subdirectory(nccl)
@@ -41,25 +39,30 @@ endif()
register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS})
# warpctc_cudnn needs cuDNN 7 or above
if (WITH_GPU)
# warpctc_op needs cuDNN 7 or above
if (WITH_GPU AND NOT WIN32)
if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()
op_library(conv_fusion_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
# conv_fusion_op needs cuDNN 7 or above
if (NOT ${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
op_library(conv_fusion_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
endif()
else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()
set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
if (NOT WIN32)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
if (WITH_GPU)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv)
endif()

@@ -22,6 +22,7 @@ DECLARE_bool(cudnn_exhaustive_search);
namespace paddle {
namespace operators {
#if CUDNN_VERSION >= 7001
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
@@ -178,10 +179,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
};
#endif
} // namespace operators
} // namespace paddle
#if CUDNN_VERSION >= 7001
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel<float>,
ops::CUDNNConvFusionOpKernel<double>);
#endif

@@ -0,0 +1,162 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/group_norm_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
class GroupNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"),
"Output(Y) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Mean"),
"Output(Mean) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Variance"),
"Output(Variance) of GroupNormOp should not be null.");
auto x_dim = ctx->GetInputDim("X");
auto channel_num = x_dim[1];
auto batch_size = x_dim[0];
auto groups = ctx->Attrs().Get<int>("groups");
PADDLE_ENFORCE_LE(
groups, channel_num,
"'groups' must be less equal than the number of channels.");
PADDLE_ENFORCE_GE(groups, 1, "'groups' must be greater than or equal to 1.");
if (ctx->HasInput("Scale")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], channel_num);
}
if (ctx->HasInput("Bias")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], channel_num);
}
ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
ctx->SetOutputDim("Mean", {batch_size, groups});
ctx->SetOutputDim("Variance", {batch_size, groups});
ctx->ShareLoD("X", "Y");
}
};
class GroupNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input tensor.");
AddInput("Scale",
"Scale is a 1-dimensional tensor of size C"
"that is applied to the output.")
.AsDispensable();
AddInput("Bias",
"Bias is a 1-dimensional tensor of size C "
"that is applied to the output")
.AsDispensable();
AddOutput("Y", "Result after normalization.");
AddOutput("Mean", "Mean of each group.").AsIntermediate();
AddOutput("Variance", "Variance of each group.").AsIntermediate();
AddAttr<float>("epsilon",
"Constant for numerical stability [default 1e-5].")
.SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 1.0f,
"'epsilon' should be between 0.0 and 1.0.");
});
AddAttr<int>("groups", "The number of groups that divided from channels.")
.AddCustomChecker([](const int &groups) {
PADDLE_ENFORCE_GT(groups, 0, "'groups' should be greater than zero.");
});
AddComment(R"DOC(
Group Normalization
Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_
)DOC");
}
};
class GroupNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
// check input
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Mean"),
"Input(Mean) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Variance"),
"Input(Variance) of GroupNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) of GroupNormOp should not be null.");
// check output
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
if (ctx->HasOutput(framework::GradVarName("Scale"))) {
ctx->SetOutputDim(framework::GradVarName("Scale"),
ctx->GetInputDim("Scale"));
}
if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias"));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const auto *var = ctx.InputVar(framework::GradVarName("Y"));
if (var == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
const Tensor *t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
}
if (t == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
return framework::OpKernelType(framework::ToDataType(t->type()),
ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp);
REGISTER_OP_CPU_KERNEL(
group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
group_norm_grad,
ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, double>);

File diff suppressed because it is too large

@@ -0,0 +1,197 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename DeviceContext, typename T>
class GroupNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Output<Tensor>("Y");
auto* mean = ctx.Output<Tensor>("Mean");
auto* var = ctx.Output<Tensor>("Variance");
const auto groups = ctx.Attr<int>("groups");
const auto x_dims = x->dims();
const int group_size = (x_dims[1] - 1) / groups + 1;
y->mutable_data<T>(ctx.GetPlace());
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
auto* x_data = x->data<T>();
auto* y_data = y->data<T>();
auto* mean_data = mean->data<T>();
auto* var_data = var->data<T>();
const T* scale_data = nullptr;
if (scale) scale_data = scale->data<T>();
const T* bias_data = nullptr;
if (bias) bias_data = bias->data<T>();
int imsize = x_dims[2] * x_dims[3];
auto* iter_x_data = x_data;
auto* iter_y_data = y_data;
for (int bid = 0; bid < x_dims[0]; bid++)
for (int gid = 0; gid < groups; gid++) {
T x_mean = 0, x_var = 0;
int number = std::min(group_size,
static_cast<int>(x_dims[1] - gid * group_size));
auto* tmp = iter_x_data;
for (int cid = 0; cid < number; cid++) {
for (int imid = 0; imid < imsize; imid++, iter_x_data++) {
x_mean += iter_x_data[0];
x_var += iter_x_data[0] * iter_x_data[0];
}
}
x_mean /= number * imsize;
x_var /= number * imsize;
x_var = x_var - x_mean * x_mean;
T var_inv = 1.0 / sqrt(x_var + epsilon);
mean_data[bid * groups + gid] = x_mean;
var_data[bid * groups + gid] = x_var;
for (int cid = 0; cid < number; cid++) {
for (int imid = 0; imid < imsize; imid++, tmp++, iter_y_data++) {
T val = (tmp[0] - x_mean) * var_inv;
if (scale_data) val *= scale_data[gid * group_size + cid];
if (bias_data) val += bias_data[gid * group_size + cid];
iter_y_data[0] = val;
}
}
}
}
};
template <typename DeviceContext, typename T>
class GroupNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto* x = ctx.Input<Tensor>("X");
auto* mean = ctx.Input<Tensor>("Mean");
auto* var = ctx.Input<Tensor>("Variance");
auto* scale = ctx.Input<Tensor>("Scale");
auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto groups = ctx.Attr<int>("groups");
// init output
auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
const auto& x_dims = x->dims();
const int group_size = (x_dims[1] - 1) / groups + 1;
// TODO(liangdun): need to check whether d_x is null
math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
T* d_x_data = nullptr;
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_x, static_cast<T>(0));
d_x_data = d_x->data<T>();
}
auto* x_data = x->data<T>();
auto* y_data = d_y->data<T>();
auto* mean_data = mean->data<T>();
auto* var_data = var->data<T>();
T* d_scale_data = nullptr;
if (d_scale) {
d_scale->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_scale, static_cast<T>(0));
d_scale_data = d_scale->data<T>();
}
T* d_bias_data = nullptr;
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_bias, static_cast<T>(0));
d_bias_data = d_bias->data<T>();
}
const T* scale_data = nullptr;
if (scale) scale_data = scale->data<T>();
int imsize = x_dims[2] * x_dims[3];
auto* iter_x_data = x_data;
auto* iter_d_x_data = d_x_data;
auto* iter_y_data = y_data;
for (int bid = 0; bid < x_dims[0]; bid++)
for (int gid = 0; gid < groups; gid++) {
T x_mean = mean_data[bid * groups + gid];
T x_var = var_data[bid * groups + gid];
T var_inv = 1.0 / sqrt(x_var + epsilon);
int number = std::min(group_size,
static_cast<int>(x_dims[1] - gid * group_size));
auto* tmp = iter_x_data;
auto* tmp2 = iter_d_x_data;
T d_var_inv = 0, d_x_mean = 0;
for (int cid = 0; cid < number; cid++) {
for (int imid = 0; imid < imsize;
imid++, tmp++, iter_y_data++, iter_d_x_data++) {
T val = (tmp[0] - x_mean) * var_inv;
T dval = iter_y_data[0];
if (d_bias_data) d_bias_data[gid * group_size + cid] += dval;
if (d_scale_data)
d_scale_data[gid * group_size + cid] += val * dval;
if (scale_data) dval = scale_data[gid * group_size + cid] * dval;
d_var_inv += (tmp[0] - x_mean) * dval;
T d_tmp = dval * var_inv;
if (d_x_data) iter_d_x_data[0] += d_tmp;
d_x_mean -= d_tmp;
}
}
T d_x_var =
-1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv;
d_x_mean -= 2 * d_x_var * x_mean;
d_x_var /= number * imsize;
d_x_mean /= number * imsize;
iter_d_x_data = tmp2;
if (d_x_data) {
for (int cid = 0; cid < number; cid++) {
for (int imid = 0; imid < imsize;
imid++, iter_x_data++, iter_d_x_data++) {
iter_d_x_data[0] += d_x_mean;
iter_d_x_data[0] += iter_x_data[0] * 2 * d_x_var;
}
}
}
}
}
};
} // namespace operators
} // namespace paddle
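Note: the forward kernel accumulates per-(sample, group) statistics in a single pass, with var = E[x^2] - E[x]^2 and group_size = (C - 1) / groups + 1, i.e. ceil(C / groups). A tiny self-contained check of that arithmetic in plain C++ (no framework types; the values are chosen only for illustration):
#include <cmath>
#include <cstdio>
int main() {
  // One sample, C = 2, groups = 1, 2x2 spatial: a single group covering
  // number * imsize = 2 * 4 = 8 values, with the op's default epsilon.
  const float eps = 1e-5f;
  const float x[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  float mean = 0, sq = 0;
  for (float v : x) { mean += v; sq += v * v; }
  mean /= 8;
  const float var = sq / 8 - mean * mean;  // E[x^2] - E[x]^2, as in the kernel
  const float var_inv = 1.0f / std::sqrt(var + eps);
  for (float v : x) std::printf("%.4f ", (v - mean) * var_inv);
  std::printf("\nmean=%.4f var=%.4f\n", mean, var);  // 4.5000, 5.2500
  return 0;
}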

@@ -1,6 +1,4 @@
if (NOT WIN32)
add_subdirectory(detail)
endif(NOT WIN32)
add_subdirectory(detail)
function(math_library TARGET)
# math_library is a function to create math library.
@@ -43,10 +41,8 @@ math_library(depthwise_conv)
math_library(im2col)
math_library(sampler)
if (NOT WIN32) # windows do not support avx functions yet.
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
endif (NOT WIN32)
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
math_library(math_function DEPS blas)
@@ -58,9 +54,9 @@ math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
math_library(softmax DEPS math_function)
if (NOT WIN32)
math_library(matrix_bit_code)
endif (NOT WIN32)
math_library(matrix_bit_code)
math_library(unpooling)
math_library(vol2col)
@@ -76,13 +72,12 @@ if(WITH_GPU)
endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
if (NOT WIN32)
set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc)
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
if(WITH_XBYAK)
list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
list(APPEND JIT_KERNEL_DEPS xbyak)
endif()
cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
endif (NOT WIN32)
set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc)
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
if(WITH_XBYAK)
list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
list(APPEND JIT_KERNEL_DEPS xbyak)
endif()
cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)

@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <math.h>
#include <string>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h"

Some files were not shown because too many files have changed in this diff
