remove patch command and files of warpctc to improve the quality of the Paddle repo (#21929)
parent b1ec1d54cf
commit 8b15acd71d
@@ -1,230 +0,0 @@
IF (APPLE)
    cmake_minimum_required(VERSION 3.4)
ELSE()
    cmake_minimum_required(VERSION 2.8)
ENDIF()

project(ctc_release)

include_directories(include)

FIND_PACKAGE(CUDA 6.5)
FIND_PACKAGE(Torch)

MESSAGE(STATUS "cuda found ${CUDA_FOUND}")
MESSAGE(STATUS "Torch found ${Torch_DIR}")

option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND})
option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND})
option(WITH_OMP "compile warp-ctc with OpenMP." ON)
option(BUILD_TESTS "build warp-ctc unit tests." ON)
option(BUILD_SHARED "build warp-ctc shared library." ON)

if(BUILD_SHARED)
    set(WARPCTC_SHARED "SHARED")
else(BUILD_SHARED)
    set(WARPCTC_SHARED "STATIC")
endif(BUILD_SHARED)

if(WIN32)
    set(CMAKE_STATIC_LIBRARY_PREFIX lib)
    set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
    foreach(flag_var
            CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE)
        if(${flag_var} MATCHES "/MD")
            string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
        endif(${flag_var} MATCHES "/MD")
    endforeach(flag_var)
else(WIN32)
    # Set C++ flags
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2")
endif(WIN32)

if(APPLE)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
    add_definitions(-DAPPLE)
endif()

if(WITH_OMP AND NOT APPLE)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
else()
    add_definitions(-DCTC_DISABLE_OMP)
endif()

# needs to be at least 30 or __shfl_down in reduce won't compile
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")

set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")

IF (CUDA_VERSION VERSION_GREATER "7.6")
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
ENDIF()

IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0"))
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
ENDIF()

IF(NOT APPLE AND NOT WIN32)
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
    if(WITH_OMP)
        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
    endif()
ENDIF()

IF (APPLE)
    EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
    STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
    MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}")

    # for El Capitan we have to use rpath

    IF (DARWIN_VERSION LESS 15)
        set(CMAKE_SKIP_RPATH TRUE)
    ENDIF ()

ELSE()
    # always skip rpath on Linux
    set(CMAKE_SKIP_RPATH TRUE)
ENDIF()

# Windows treats a symbolic file as a real file, which is different from Unix.
# We create a hidden file and compile it instead of the original source file.
function(windows_symbolic TARGET)
    set(oneValueArgs "")
    set(multiValueArgs SRCS PATH DEPS)
    cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
    foreach(src ${windows_symbolic_SRCS})
        get_filename_component(src ${src} NAME_WE)
        if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu)
            message(FATAL " ${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic file.")
        endif()

        # only copy xx.cu to .xx.cu when the content has been modified
        set(copy_flag 1)
        if (EXISTS ${final_path}/.${src}.cu)
            file(READ ${final_path}/${src}.cpp SOURCE_STR)
            file(READ ${final_path}/.${src}.cu TARGET_STR)
            if (SOURCE_STR STREQUAL TARGET_STR)
                set(copy_flag 0)
            endif()
        endif()
        if (copy_flag)
            add_custom_command(OUTPUT ${final_path}/.${src}.cu
                    COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
                    COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu"
                    COMMENT "create hidden file of ${src}.cu")
        endif(copy_flag)
        add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu)
    endforeach()
endfunction()

IF (WITH_GPU)

    MESSAGE(STATUS "Building shared library with GPU support")
    MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS})

    if (WIN32)
        SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"")
        windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src)
        CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu)
    else()
        CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu)
    endif(WIN32)

    IF (NOT WITH_TORCH)
        TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY})
    ENDIF()

    if(BUILD_TESTS)
        add_executable(test_cpu tests/test_cpu.cpp)
        TARGET_LINK_LIBRARIES(test_cpu warpctc)
        SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")

        cuda_add_executable(test_gpu tests/test_gpu.cu)
        TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
    endif(BUILD_TESTS)

    INSTALL(TARGETS warpctc
            RUNTIME DESTINATION "bin"
            LIBRARY DESTINATION "lib"
            ARCHIVE DESTINATION "lib")

    INSTALL(FILES include/ctc.h DESTINATION "include")

    IF (WITH_TORCH)
        MESSAGE(STATUS "Building Torch Bindings with GPU support")
        INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc")
        INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC)

        TARGET_LINK_LIBRARIES(warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
        INSTALL(TARGETS warpctc
                RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
                LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
                ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")

        SET(src torch_binding/binding.cpp torch_binding/utils.c)
        SET(luasrc torch_binding/init.lua)

        ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
        IF (APPLE)
            TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
        ELSE()
            TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp)
        ENDIF()
    ENDIF()

ELSE()
    MESSAGE(STATUS "Building shared library with no GPU support")

    if (NOT APPLE AND NOT WIN32)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
    ENDIF()

    ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp)

    if(BUILD_TESTS)
        add_executable(test_cpu tests/test_cpu.cpp)
        TARGET_LINK_LIBRARIES(test_cpu warpctc)
        SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
    endif(BUILD_TESTS)

    INSTALL(TARGETS warpctc
            RUNTIME DESTINATION "bin"
            LIBRARY DESTINATION "lib"
            ARCHIVE DESTINATION "lib")

    INSTALL(FILES include/ctc.h DESTINATION "include")

    IF (WITH_TORCH)
        MESSAGE(STATUS "Building Torch Bindings with no GPU support")
        add_definitions(-DTORCH_NOGPU)
        INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH)

        TARGET_LINK_LIBRARIES(warpctc luajit luaT TH)

        INSTALL(TARGETS warpctc
                RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
                LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
                ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")

        SET(src torch_binding/binding.cpp torch_binding/utils.c)
        SET(luasrc torch_binding/init.lua)

        ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
        IF (APPLE)
            TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH)
        ELSE()
            TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH gomp)
        ENDIF()
    ENDIF()

ENDIF()
File diff suppressed because it is too large.
@@ -1,160 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/** \file ctc.h
 * Contains a simple C interface to call fast CPU and GPU based computation
 * of the CTC loss.
 */

#pragma once

#ifdef _WIN32
#ifdef warpctc_EXPORTS
#define API_REFERENCE extern "C" __declspec(dllexport)
#else
#define API_REFERENCE extern "C" __declspec(dllimport)
#endif
#else
#define API_REFERENCE
#endif

#ifdef __cplusplus
#include <cstddef>
extern "C" {
#endif

// forward declare of CUDA typedef to avoid needing to pull in CUDA headers
typedef struct CUstream_st* CUstream;

typedef enum {
  CTC_STATUS_SUCCESS = 0,
  CTC_STATUS_MEMOPS_FAILED = 1,
  CTC_STATUS_INVALID_VALUE = 2,
  CTC_STATUS_EXECUTION_FAILED = 3,
  CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;

/** Returns a single integer which specifies the API version of the warpctc
 *  library */
API_REFERENCE int get_warpctc_version();

/** Returns a string containing a description of status that was passed in
 *  \param[in] status identifies which string should be returned
 *  \return C style string containing the text description
 *  */
API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status);

typedef enum { CTC_CPU = 0, CTC_GPU = 1 } ctcComputeLocation;

/** Structure used for options to the CTC computation. Applications
 *  should zero out the array using memset and sizeof(struct
 *  ctcOptions) in C or default initialization (e.g. 'ctcOptions
 *  options{};' or 'auto options = ctcOptions{}') in C++ to ensure
 *  forward compatibility with added options. */
struct ctcOptions {
  /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
  ctcComputeLocation loc;
  union {
    /// used when loc == CTC_CPU, the maximum number of threads that can be used
    unsigned int num_threads;

    /// used when loc == CTC_GPU, which stream the kernels should be launched in
    CUstream stream;
  };

  /// the label value/index that the CTC calculation should use as the blank
  /// label
  int blank_label;
};

/** Compute the connectionist temporal classification loss between a sequence
 *  of probabilities and a ground truth labeling. Optionally compute the
 *  gradient with respect to the inputs.
 * \param [in] activations pointer to the activations in either CPU or GPU
 *             addressable memory, depending on info. We assume a fixed
 *             memory layout for this 3 dimensional tensor, which has dimension
 *             (t, n, p), where t is the time index, n is the minibatch index,
 *             and p indexes over probabilities of each symbol in the alphabet.
 *             The memory layout is (t, n, p) in C order (slowest to fastest
 *             changing index, aka row-major), or (p, n, t) in Fortran order
 *             (fastest to slowest changing index, aka column-major). We also
 *             assume strides are equal to dimensions - there is no padding
 *             between dimensions.
 *             More precisely, element (t, n, p), for a problem with mini_batch
 *             examples in the mini batch, and alphabet_size symbols in the
 *             alphabet, is located at:
 *             activations[(t * mini_batch + n) * alphabet_size + p]
 * \param [out] gradients if not NULL, then gradients are computed. Should be
 *              allocated in the same memory space as probs and memory
 *              ordering is identical.
 * \param [in] flat_labels Always in CPU memory. A concatenation
 *             of all the labels for the minibatch.
 * \param [in] label_lengths Always in CPU memory. The length of each label
 *             for each example in the minibatch.
 * \param [in] input_lengths Always in CPU memory. The number of time steps
 *             for each sequence in the minibatch.
 * \param [in] alphabet_size The number of possible output symbols. There
 *             should be this many probabilities for each time step.
 * \param [in] mini_batch How many examples in a minibatch.
 * \param [out] costs Always in CPU memory. The cost of each example in the
 *              minibatch.
 * \param [in,out] workspace In same memory space as probs. Should be of
 *                 size requested by get_workspace_size.
 * \param [in] options see struct ctcOptions
 *
 * \return Status information
 * */
API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations,
                                           float* gradients,
                                           const int* const flat_labels,
                                           const int* const label_lengths,
                                           const int* const input_lengths,
                                           int alphabet_size,
                                           int minibatch,
                                           float* costs,
                                           void* workspace,
                                           ctcOptions options);

/** For a given set of labels and minibatch size return the required workspace
 *  size. This will need to be allocated in the same memory space as your
 *  probabilities.
 * \param [in] label_lengths Always in CPU memory. The length of each label
 *             for each example in the minibatch.
 * \param [in] input_lengths Always in CPU memory. The number of time steps
 *             for each sequence in the minibatch.
 * \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
 *             the number of probabilities at each time step
 * \param [in] mini_batch How many examples in a minibatch.
 * \param [in] info see struct ctcOptions
 * \param [out] size_bytes is a pointer to a scalar where the memory
 *              requirement in bytes will be placed. This memory should be
 *              allocated at the same place, CPU or GPU, that the probs are in
 *
 * \return Status information
 **/
API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths,
                                             const int* const input_lengths,
                                             int alphabet_size,
                                             int minibatch,
                                             ctcOptions info,
                                             size_t* size_bytes);

#ifdef __cplusplus
}
#endif
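To make the interface above concrete, here is a minimal CPU-only usage sketch. It is not part of the removed files: the toy sizes, label values, and uniform activations are illustrative assumptions; only the types and functions (ctcOptions, CTC_CPU, get_workspace_size, compute_ctc_loss, ctcGetStatusString) come from the ctc.h shown above.

// Minimal sketch: zero-init ctcOptions, query the workspace size, run the loss on CPU.
#include <cstring>
#include <iostream>
#include <vector>
#include <ctc.h>  // the header shown above

int main() {
  const int alphabet_size = 5;  // toy sizes, chosen only for illustration
  const int T = 10;             // time steps
  const int minibatch = 1;

  // Layout per the doc comment: activations[(t * minibatch + n) * alphabet_size + p]
  std::vector<float> activations(T * minibatch * alphabet_size, 0.1f);

  std::vector<int> labels = {1, 2, 3};  // one label sequence, concatenated
  std::vector<int> label_lengths = {3};
  std::vector<int> input_lengths = {T};

  ctcOptions options;
  std::memset(&options, 0, sizeof(ctcOptions));  // forward-compatible zero init
  options.loc = CTC_CPU;
  options.num_threads = 1;
  options.blank_label = 0;

  size_t workspace_bytes = 0;
  if (get_workspace_size(label_lengths.data(), input_lengths.data(),
                         alphabet_size, minibatch, options,
                         &workspace_bytes) != CTC_STATUS_SUCCESS)
    return 1;

  std::vector<char> workspace(workspace_bytes);
  std::vector<float> costs(minibatch);

  ctcStatus_t status = compute_ctc_loss(activations.data(),
                                        nullptr,  // no gradients requested
                                        labels.data(),
                                        label_lengths.data(),
                                        input_lengths.data(),
                                        alphabet_size,
                                        minibatch,
                                        costs.data(),
                                        workspace.data(),
                                        options);
  std::cout << ctcGetStatusString(status) << ", cost = " << costs[0] << std::endl;
  return status == CTC_STATUS_SUCCESS ? 0 : 1;
}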
File diff suppressed because it is too large.
File diff suppressed because it is too large.
@@ -1,38 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef __CUDACC__
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif

// NOTE(dzhwinter)
// the warp primitive is different on cuda9 (Volta) GPUs.
// add a wrapper to stay compatible from cuda7 to cuda9
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
#define DEFAULT_MASK 0u
template <typename T>
__forceinline__ __device__ T __shfl_down(T input, int delta) {
  return __shfl_down_sync(DEFAULT_MASK, input, delta);
}

template <typename T>
__forceinline__ __device__ T __shfl_up(T input, int delta) {
  return __shfl_up_sync(DEFAULT_MASK, input, delta);
}

#endif
@@ -1,186 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <cstddef>
#include <iostream>

#include <ctc.h>

#include "detail/cpu_ctc.h"
#ifdef __CUDACC__
#include "detail/gpu_ctc.h"
#endif

extern "C" {

int get_warpctc_version() { return 2; }

const char* ctcGetStatusString(ctcStatus_t status) {
  switch (status) {
    case CTC_STATUS_SUCCESS:
      return "no error";
    case CTC_STATUS_MEMOPS_FAILED:
      return "cuda memcpy or memset failed";
    case CTC_STATUS_INVALID_VALUE:
      return "invalid value";
    case CTC_STATUS_EXECUTION_FAILED:
      return "execution failed";

    case CTC_STATUS_UNKNOWN_ERROR:
    default:
      return "unknown error";
  }
}

ctcStatus_t compute_ctc_loss(const float* const activations,
                             float* gradients,
                             const int* const flat_labels,
                             const int* const label_lengths,
                             const int* const input_lengths,
                             int alphabet_size,
                             int minibatch,
                             float* costs,
                             void* workspace,
                             ctcOptions options) {
  if (activations == nullptr || flat_labels == nullptr ||
      label_lengths == nullptr || input_lengths == nullptr ||
      costs == nullptr || workspace == nullptr || alphabet_size <= 0 ||
      minibatch <= 0)
    return CTC_STATUS_INVALID_VALUE;

  if (options.loc == CTC_CPU) {
    CpuCTC<float> ctc(alphabet_size,
                      minibatch,
                      workspace,
                      options.num_threads,
                      options.blank_label);

    if (gradients != NULL)
      return ctc.cost_and_grad(activations,
                               gradients,
                               costs,
                               flat_labels,
                               label_lengths,
                               input_lengths);
    else
      return ctc.score_forward(
          activations, costs, flat_labels, label_lengths, input_lengths);
  } else if (options.loc == CTC_GPU) {
#ifdef __CUDACC__
    GpuCTC<float> ctc(alphabet_size,
                      minibatch,
                      workspace,
                      options.stream,
                      options.blank_label);

    if (gradients != NULL)
      return ctc.cost_and_grad(activations,
                               gradients,
                               costs,
                               flat_labels,
                               label_lengths,
                               input_lengths);
    else
      return ctc.score_forward(
          activations, costs, flat_labels, label_lengths, input_lengths);
#else
    std::cerr << "GPU execution requested, but not compiled with GPU support"
              << std::endl;
    return CTC_STATUS_EXECUTION_FAILED;
#endif
  } else {
    return CTC_STATUS_INVALID_VALUE;
  }
}

ctcStatus_t get_workspace_size(const int* const label_lengths,
                               const int* const input_lengths,
                               int alphabet_size,
                               int minibatch,
                               ctcOptions options,
                               size_t* size_bytes) {
  if (label_lengths == nullptr || input_lengths == nullptr ||
      size_bytes == nullptr || alphabet_size <= 0 || minibatch <= 0)
    return CTC_STATUS_INVALID_VALUE;

  // This is the max of all S and T for all examples in the minibatch.
  int maxL = *std::max_element(label_lengths, label_lengths + minibatch);
  int maxT = *std::max_element(input_lengths, input_lengths + minibatch);

  const int S = 2 * maxL + 1;

  *size_bytes = 0;

  if (options.loc == CTC_GPU) {
    // GPU storage
    // nll_forward, nll_backward
    *size_bytes += 2 * sizeof(float) * minibatch;

    // repeats
    *size_bytes += sizeof(int) * minibatch;

    // label offsets
    *size_bytes += sizeof(int) * minibatch;

    // utt_length
    *size_bytes += sizeof(int) * minibatch;

    // label lengths
    *size_bytes += sizeof(int) * minibatch;

    // labels without blanks - overallocate for now
    *size_bytes += sizeof(int) * maxL * minibatch;

    // labels with blanks
    *size_bytes += sizeof(int) * S * minibatch;

    // alphas
    *size_bytes += sizeof(float) * S * maxT * minibatch;

    // denoms
    *size_bytes += sizeof(float) * maxT * minibatch;

    // probs (since we will pass in activations)
    *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;

  } else {
    // cpu can eventually replace all minibatch with
    // max number of concurrent threads if memory is
    // really tight

    // per minibatch memory
    size_t per_minibatch_bytes = 0;

    // output
    per_minibatch_bytes += sizeof(float) * alphabet_size;

    // alphas
    per_minibatch_bytes += sizeof(float) * S * maxT;

    // betas
    per_minibatch_bytes += sizeof(float) * S;

    // labels w/blanks, e_inc, s_inc
    per_minibatch_bytes += 3 * sizeof(int) * S;

    *size_bytes = per_minibatch_bytes * minibatch;

    // probs
    *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
  }

  return CTC_STATUS_SUCCESS;
}
}
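As a quick sanity check on the CPU branch of get_workspace_size above, the following standalone sketch recomputes the same byte count for one toy minibatch. It is not part of the removed files; the sizes (alphabet_size = 5, maxL = 3, maxT = 10, minibatch = 1) are illustrative assumptions.

// Standalone recomputation of the CPU workspace formula above (illustrative only).
#include <cstdio>

int main() {
  const int alphabet_size = 5, maxL = 3, maxT = 10, minibatch = 1;
  const int S = 2 * maxL + 1;  // label length with interleaved blanks

  size_t per_minibatch_bytes = 0;
  per_minibatch_bytes += sizeof(float) * alphabet_size;  // output
  per_minibatch_bytes += sizeof(float) * S * maxT;       // alphas
  per_minibatch_bytes += sizeof(float) * S;              // betas
  per_minibatch_bytes += 3 * sizeof(int) * S;            // labels w/blanks, e_inc, s_inc

  size_t size_bytes = per_minibatch_bytes * minibatch;
  size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;  // probs

  // With these toy numbers: 20 + 280 + 28 + 84 = 412, plus 200 for probs -> 612 bytes.
  std::printf("expected CPU workspace: %zu bytes\n", size_bytes);
  return 0;
}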
@@ -1,217 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Includes, system
// #include <stdio.h>
// #include <stdlib.h>

// Includes, cuda
// #include <cuda_runtime.h>
// #include <cublas_v2.h>

// Includes, cuda helper functions
// #include <helper_cuda.h>

// For the functors
#include "ctc.h"
#include "detail/ctc_helper.h"

const int warp_size = 32;

template <int NT, typename T, typename Rop>
struct CTAReduce;

template <int NT, typename T, typename Rop>
struct CTAReduce {
  enum { Size = NT, Capacity = NT };
  struct Storage {
    T shared[Capacity];
  };

  __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) {
    T* s = storage.shared;
    s[tid] = x;
    __syncthreads();

    // Fold the data in half with each pass.
#pragma unroll
    for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
      if (tid + offset < count && tid < offset) {
        // Read from the right half and store to the left half.
        x = g(x, s[offset + tid]);
        s[tid] = x;
      }
      __syncthreads();
    }

    T shuff;
    for (int offset = warp_size / 2; offset > 0; offset /= 2) {
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
      shuff = __shfl_down_sync(0xFFFFFFFF, x, offset);
#else
      shuff = __shfl_down(x, offset);
#endif
      if (tid + offset < count && tid < offset) x = g(x, shuff);
    }
    return x;
  }
};

template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(
    Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) {
  typedef CTAReduce<NT, T, Rop> R;
  __shared__ typename R::Storage storage;

  int tid = threadIdx.x;
  int idx = tid;
  int col = blockIdx.x;
  T curr;

  // Each block works on a column
  if (idx < num_rows) curr = f(input[idx + col * num_rows]);
  idx += NT;

  while (idx < num_rows) {
    curr = g(curr, f(input[idx + col * num_rows]));
    idx += NT;
  }

  // Sum thread-totals over the CTA.
  curr = R::reduce(tid, curr, storage, num_rows, g);

  // Store result in out
  if (tid == 0) output[col] = curr;
}

template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(
    Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) {
  __shared__ T s[NT];

  int warps_per_block = NT / warp_size;
  int row = blockDim.x * blockIdx.x + threadIdx.x;
  int col = threadIdx.y;
  T curr;

  if (row < num_rows && col < num_cols) {
    curr = f(input[row + col * num_rows]);
    col += blockDim.y;
    while (col < num_cols) {
      curr = g(curr, f(input[row + col * num_rows]));
      col += blockDim.y;
    }
  }
  s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
  __syncthreads();

  // Reduce
  if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
    for (int i = 1; i < warps_per_block && i < num_cols; ++i)
      curr = g(curr, s[i + threadIdx.x * warps_per_block]);
    output[row] = curr;
  }
}

struct ReduceHelper {
  template <typename T, typename Iof, typename Rof>
  static void impl(Iof f,
                   Rof g,
                   const T* input,
                   T* output,
                   int num_rows,
                   int num_cols,
                   bool axis,
                   cudaStream_t stream) {
    int grid_size;

    if (axis) {
      grid_size = num_cols;
      reduce_rows<128><<<grid_size, 128, 0, stream>>>(
          f, g, input, output, num_rows, num_cols);

    } else {
      dim3 tpb(warp_size, 128 / warp_size);
      grid_size = (num_cols + warp_size - 1) / warp_size;
      reduce_cols<128><<<grid_size, tpb, 0, stream>>>(
          f, g, input, output, num_rows, num_cols);
    }
  }
};

template <typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f,
                   Rof g,
                   const T* input,
                   T* output,
                   int rows,
                   int cols,
                   bool axis,
                   cudaStream_t stream) {
  ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
  cudaStreamSynchronize(stream);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED;

  return CTC_STATUS_SUCCESS;
}

ctcStatus_t reduce_negate(const float* input,
                          float* output,
                          int rows,
                          int cols,
                          bool axis,
                          cudaStream_t stream) {
  return reduce(ctc_helper::negate<float>(),
                ctc_helper::add<float>(),
                input,
                output,
                rows,
                cols,
                axis,
                stream);
}

ctcStatus_t reduce_exp(const float* input,
                       float* output,
                       int rows,
                       int cols,
                       bool axis,
                       cudaStream_t stream) {
  return reduce(ctc_helper::exponential<float>(),
                ctc_helper::add<float>(),
                input,
                output,
                rows,
                cols,
                axis,
                stream);
}

ctcStatus_t reduce_max(const float* input,
                       float* output,
                       int rows,
                       int cols,
                       bool axis,
                       cudaStream_t stream) {
  return reduce(ctc_helper::identity<float>(),
                ctc_helper::maximum<float>(),
                input,
                output,
                rows,
                cols,
                axis,
                stream);
}
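The reduction file above boils down to one pattern: apply an element-wise functor f to every entry of a column, then fold the column with a binary functor g (reduce_max pairs identity with maximum, reduce_exp pairs exponential with add, presumably to build the stable softmax denominators used elsewhere in warp-ctc, which this diff does not show). The following host-side sketch is not part of the removed files; it is a plain C++ reference for that column reduction with illustrative input values.

// Host-side reference for the column reduction performed by reduce_rows above:
// apply f to every element of a column, then fold the column with g.
#include <cmath>
#include <cstdio>
#include <vector>

template <typename Iop, typename Rop>
void reduce_rows_ref(Iop f, Rop g, const float* input, float* output,
                     int num_rows, int num_cols) {
  for (int col = 0; col < num_cols; ++col) {
    float curr = f(input[col * num_rows]);  // column-major: rows are contiguous
    for (int row = 1; row < num_rows; ++row)
      curr = g(curr, f(input[row + col * num_rows]));
    output[col] = curr;
  }
}

int main() {
  // Two columns of three activations each (illustrative values).
  std::vector<float> input = {0.f, 1.f, 2.f, -1.f, 0.f, 1.f};
  std::vector<float> max_out(2), expsum_out(2);

  auto identity = [](float x) { return x; };
  auto exp_f = [](float x) { return std::exp(x); };
  auto max_f = [](float a, float b) { return a > b ? a : b; };
  auto add_f = [](float a, float b) { return a + b; };

  reduce_rows_ref(identity, max_f, input.data(), max_out.data(), 3, 2);   // like reduce_max
  reduce_rows_ref(exp_f, add_f, input.data(), expsum_out.data(), 3, 2);   // like reduce_exp

  std::printf("max: %.3f %.3f, sum(exp): %.3f %.3f\n",
              max_out[0], max_out[1], expsum_out[0], expsum_out[1]);
  return 0;
}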
@@ -1,97 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <cmath>    // for std::exp used in softmax below
#include <limits>   // for std::numeric_limits used in softmax below
#include <numeric>
#include <random>
#include <stdexcept>
#include <vector>

#include <ctc.h>

inline void throw_on_error(ctcStatus_t status, const char* message) {
  if (status != CTC_STATUS_SUCCESS) {
    throw std::runtime_error(
        message + (", stat = " + std::string(ctcGetStatusString(status))));
  }
}

#ifdef __CUDACC__
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>

inline void throw_on_error(cudaError_t error, const char* message) {
  if (error) {
    throw thrust::system_error(error, thrust::cuda_category(), message);
  }
}

#endif

std::vector<float> genActs(int size) {
  std::vector<float> arr(size);
  std::mt19937 gen(0);
  std::uniform_real_distribution<> dis(0, 1);
  for (int i = 0; i < size; ++i) arr[i] = dis(gen);
  return arr;
}

std::vector<int> genLabels(int alphabet_size, int L) {
  std::vector<int> label(L);

  std::mt19937 gen(1);
  std::uniform_int_distribution<> dis(1, alphabet_size - 1);

  for (int i = 0; i < L; ++i) {
    label[i] = dis(gen);
  }
  // guarantee repeats for testing
  if (L >= 3) {
    label[L / 2] = label[L / 2 + 1];
    label[L / 2 - 1] = label[L / 2];
  }
  return label;
}

float rel_diff(const std::vector<float>& grad,
               const std::vector<float>& num_grad) {
  float diff = 0.;
  float tot = 0.;
  for (size_t idx = 0; idx < grad.size(); ++idx) {
    diff += (grad[idx] - num_grad[idx]) * (grad[idx] - num_grad[idx]);
    tot += grad[idx] * grad[idx];
  }

  return diff / tot;
}

// Numerically stable softmax for a minibatch of 1
void softmax(const float* const acts, int alphabet_size, int T, float* probs) {
  for (int t = 0; t < T; ++t) {
    float max_activation = -std::numeric_limits<float>::infinity();

    for (int a = 0; a < alphabet_size; ++a)
      max_activation = std::max(max_activation, acts[t * alphabet_size + a]);

    float denom = 0;
    for (int a = 0; a < alphabet_size; ++a)
      denom += std::exp(acts[t * alphabet_size + a] - max_activation);

    for (int a = 0; a < alphabet_size; ++a)
      probs[t * alphabet_size + a] =
          std::exp(acts[t * alphabet_size + a] - max_activation) / denom;
  }
}
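For orientation, here is a small sketch of how the test helpers above combine: generate random activations for T time steps, softmax them, and draw a label sequence. It is not part of the removed tests, the sizes are illustrative, and the header name "test_utils.h" is an assumption (the real path is not shown in this diff).

// Hedged usage sketch for the test helpers above (genActs, softmax, genLabels).
#include <cstdio>
#include <vector>
#include "test_utils.h"  // assumed name of the header shown above

int main() {
  const int alphabet_size = 5, T = 4, L = 3;  // toy sizes

  std::vector<float> acts = genActs(alphabet_size * T);
  std::vector<float> probs(alphabet_size * T);
  softmax(acts.data(), alphabet_size, T, probs.data());

  std::vector<int> labels = genLabels(alphabet_size, L);

  // Each time step of probs now sums to 1.
  for (int t = 0; t < T; ++t) {
    float row_sum = 0.f;
    for (int a = 0; a < alphabet_size; ++a) row_sum += probs[t * alphabet_size + a];
    std::printf("t=%d sum=%.3f\n", t, row_sum);
  }
  std::printf("label[0]=%d\n", labels[0]);
  return 0;
}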
File diff suppressed because it is too large.
File diff suppressed because it is too large.