remove the warpctc patch command and patch files to improve the quality of the Paddle repo (#21929)

release/1.7
zhouwei25 5 years ago committed by Tao Luo
parent b1ec1d54cf
commit 8b15acd71d

@@ -14,11 +14,11 @@
INCLUDE(ExternalProject)
-SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
-SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
+SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
+SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
-set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc)
-set(WARPCTC_TAG 6d5b8fac130638862d97dc48ef43a8d7b5a503bb)
+set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc)
+set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
@@ -36,14 +36,6 @@ cache_third_party(extern_warpctc
REPOSITORY ${WARPCTC_REPOSITORY}
TAG ${WARPCTC_TAG})
-if(WIN32)
-file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/warpctc native_src)
-file(TO_NATIVE_PATH ${WARPCTC_SOURCE_DIR} native_dst)
-set(WARPCTC_PATCH_COMMAND xcopy ${native_src} ${native_dst} /E/Y)
-else()
-set(WARPCTC_PATCH_COMMAND cp -r ${PADDLE_SOURCE_DIR}/patches/warpctc/. ${WARPCTC_SOURCE_DIR})
-endif()
ExternalProject_Add(
extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS}
@@ -52,7 +44,7 @@ ExternalProject_Add(
PREFIX ${WARPCTC_PREFIX_DIR}
SOURCE_DIR ${WARPCTC_SOURCE_DIR}
UPDATE_COMMAND ""
-PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
+PATCH_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}

@@ -1,230 +0,0 @@
IF (APPLE)
cmake_minimum_required(VERSION 3.4)
ELSE()
cmake_minimum_required(VERSION 2.8)
ENDIF()
project(ctc_release)
include_directories(include)
FIND_PACKAGE(CUDA 6.5)
FIND_PACKAGE(Torch)
MESSAGE(STATUS "cuda found ${CUDA_FOUND}")
MESSAGE(STATUS "Torch found ${Torch_DIR}")
option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND})
option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND})
option(WITH_OMP "compile warp-ctc with OpenMP." ON)
option(BUILD_TESTS "build warp-ctc unit tests." ON)
option(BUILD_SHARED "build warp-ctc shared library." ON)
if(BUILD_SHARED)
set(WARPCTC_SHARED "SHARED")
else(BUILD_SHARED)
set(WARPCTC_SHARED "STATIC")
endif(BUILD_SHARED)
if(WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
else(WIN32)
# Set c++ flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2")
endif(WIN32)
if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
add_definitions(-DAPPLE)
endif()
if(WITH_OMP AND NOT APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
else()
add_definitions(-DCTC_DISABLE_OMP)
endif()
# needs to be at least 30 or __shfl_down in reduce won't compile
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
IF (CUDA_VERSION VERSION_GREATER "7.6")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
ENDIF()
IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0"))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
ENDIF()
IF(NOT APPLE AND NOT WIN32)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
if(WITH_OMP)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
endif()
ENDIF()
IF (APPLE)
EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}")
# for El Capitan we have to use rpath
IF (DARWIN_VERSION LESS 15)
set(CMAKE_SKIP_RPATH TRUE)
ENDIF ()
ELSE()
# always skip for Linux
set(CMAKE_SKIP_RPATH TRUE)
ENDIF()
# Windows treats a symbolic file as a real file, which is different from Unix.
# We create a hidden file and compile it instead of the original source file.
function(windows_symbolic TARGET)
set(oneValueArgs "")
set(multiValueArgs SRCS PATH DEPS)
cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
foreach(src ${windows_symbolic_SRCS})
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu)
message(FATAL_ERROR "${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic file.")
endif()
# only copy xx.cu to .xx.cu when the content is modified
set(copy_flag 1)
if (EXISTS ${final_path}/.${src}.cu)
file(READ ${final_path}/${src}.cpp SOURCE_STR)
file(READ ${final_path}/.${src}.cu TARGET_STR)
if (SOURCE_STR STREQUAL TARGET_STR)
set(copy_flag 0)
endif()
endif()
if (copy_flag)
add_custom_command(OUTPUT ${final_path}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
endif(copy_flag)
add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu)
endforeach()
endfunction()
IF (WITH_GPU)
MESSAGE(STATUS "Building shared library with GPU support")
MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS})
if (WIN32)
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"")
windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src)
CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu)
else()
CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu)
endif(WIN32)
IF (NOT WITH_TORCH)
TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY})
ENDIF()
if(BUILD_TESTS)
add_executable(test_cpu tests/test_cpu.cpp )
TARGET_LINK_LIBRARIES(test_cpu warpctc)
SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
cuda_add_executable(test_gpu tests/test_gpu.cu)
TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
endif(BUILD_TESTS)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "bin"
LIBRARY DESTINATION "lib"
ARCHIVE DESTINATION "lib")
INSTALL(FILES include/ctc.h DESTINATION "include")
IF (WITH_TORCH)
MESSAGE(STATUS "Building Torch Bindings with GPU support")
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc")
INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC)
TARGET_LINK_LIBRARIES(warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")
SET(src torch_binding/binding.cpp torch_binding/utils.c)
SET(luasrc torch_binding/init.lua)
ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
IF (APPLE)
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
ELSE()
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp)
ENDIF()
ENDIF()
ELSE()
MESSAGE(STATUS "Building shared library with no GPU support")
if (NOT APPLE AND NOT WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
ENDIF()
ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp)
if(BUILD_TESTS)
add_executable(test_cpu tests/test_cpu.cpp )
TARGET_LINK_LIBRARIES(test_cpu warpctc)
SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
endif(BUILD_TESTS)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "bin"
LIBRARY DESTINATION "lib"
ARCHIVE DESTINATION "lib")
INSTALL(FILES include/ctc.h DESTINATION "include")
IF (WITH_TORCH)
MESSAGE(STATUS "Building Torch Bindings with no GPU support")
add_definitions(-DTORCH_NOGPU)
INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH)
TARGET_LINK_LIBRARIES(warpctc luajit luaT TH)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")
SET(src torch_binding/binding.cpp torch_binding/utils.c)
SET(luasrc torch_binding/init.lua)
ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
IF (APPLE)
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH)
ELSE()
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH gomp)
ENDIF()
ENDIF()
ENDIF()

@@ -1,160 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef _WIN32
#ifdef warpctc_EXPORTS
#define API_REFERENCE extern "C" __declspec(dllexport)
#else
#define API_REFERENCE extern "C" __declspec(dllimport)
#endif
#else
#define API_REFERENCE
#endif
#ifdef __cplusplus
#include <cstddef>
extern "C" {
#endif
// forward declaration of a CUDA typedef to avoid needing to pull in CUDA headers
typedef struct CUstream_st* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc
* library */
API_REFERENCE int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status);
typedef enum { CTC_CPU = 0, CTC_GPU = 1 } ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
* should zero out the array using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank
/// label
int blank_label;
};
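// Illustrative usage (not part of the original header): value-initialization
// zeroes every field, per the forward-compatibility note above, after which
// only the fields you need are set.
//
//     ctcOptions options{};      // loc, the union, and blank_label all zeroed
//     options.loc = CTC_CPU;
//     options.num_threads = 4;   // CPU path: maximum worker threads
//     options.blank_label = 0;   // label index reserved for the CTC blank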
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
*             addressable memory, depending on info. We assume a fixed memory
*             layout for this 3-dimensional tensor, which has dimension
*             (t, n, p), where t is the time index, n is the minibatch index,
*             and p indexes over probabilities of each symbol in the alphabet.
*             The memory layout is (t, n, p) in C order (slowest to fastest
*             changing index, aka row-major), or (p, n, t) in Fortran order
*             (fastest to slowest changing index, aka column-major). We also
*             assume strides are equal to dimensions - there is no padding
*             between dimensions. More precisely, element (t, n, p), for a
*             problem with mini_batch examples in the minibatch and
*             alphabet_size symbols in the alphabet, is located at:
*             activations[(t * mini_batch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as probs and memory
* ordering is identical.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
* \param [in] mini_batch How many examples in a minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In same memory space as probs. Should be of
* size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float* costs,
void* workspace,
ctcOptions options);
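// Worked layout example (illustrative values, not from the original header):
// with mini_batch = 2 and alphabet_size = 4, the activation for time step
// t = 3, example n = 1, symbol p = 2 is at
//     activations[(3 * 2 + 1) * 4 + 2] == activations[30].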
/** For a given set of labels and minibatch size return the required workspace
* size. This will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
* \param [in] mini_batch How many examples in a minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes is a pointer to a scalar where the memory
*              requirement in bytes will be placed. This memory should be
*              allocated at the same place, CPU or GPU, that the probs are in
*
* \return Status information
**/
API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
ctcOptions info,
size_t* size_bytes);
#ifdef __cplusplus
}
#endif
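
Taken together, the header documents a two-call protocol: size the workspace first, then compute the loss. Below is a minimal, hypothetical CPU driver sketched from the declarations above alone; the constants (T, the all-0.2f activations, the label values) are invented for illustration.

#include <vector>
#include "ctc.h"

int main() {
  const int alphabet_size = 5, T = 10, minibatch = 1;
  std::vector<float> activations(T * minibatch * alphabet_size, 0.2f);
  std::vector<int> labels = {1, 2, 1};   // flat_labels for the one example
  std::vector<int> label_lengths = {3};
  std::vector<int> input_lengths = {T};
  std::vector<float> costs(minibatch);

  ctcOptions options{};                  // zeroed for forward compatibility
  options.loc = CTC_CPU;
  options.num_threads = 1;

  size_t workspace_bytes = 0;
  if (get_workspace_size(label_lengths.data(), input_lengths.data(),
                         alphabet_size, minibatch, options,
                         &workspace_bytes) != CTC_STATUS_SUCCESS)
    return 1;

  std::vector<char> workspace(workspace_bytes);
  // gradients == nullptr selects the score-only (forward) path
  ctcStatus_t stat = compute_ctc_loss(activations.data(), nullptr,
                                      labels.data(), label_lengths.data(),
                                      input_lengths.data(), alphabet_size,
                                      minibatch, costs.data(),
                                      workspace.data(), options);
  return stat == CTC_STATUS_SUCCESS ? 0 : 1;
}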

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -1,38 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef __CUDACC__
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif
// NOTE(dzhwinter)
// The warp shuffle primitives changed with CUDA 9 (Volta-generation GPUs).
// Add a wrapper so the code stays compatible from CUDA 7 through CUDA 9.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define DEFAULT_MASK 0u
template <typename T>
__forceinline__ __device__ T __shfl_down(T input, int delta) {
return __shfl_down_sync(DEFAULT_MASK, input, delta);
}
template <typename T>
__forceinline__ __device__ T __shfl_up(T input, int delta) {
return __shfl_up_sync(DEFAULT_MASK, input, delta);
}
#endif

@@ -1,186 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <ctc.h>
#include "detail/cpu_ctc.h"
#ifdef __CUDACC__
#include "detail/gpu_ctc.h"
#endif
extern "C" {
int get_warpctc_version() { return 2; }
const char* ctcGetStatusString(ctcStatus_t status) {
switch (status) {
case CTC_STATUS_SUCCESS:
return "no error";
case CTC_STATUS_MEMOPS_FAILED:
return "cuda memcpy or memset failed";
case CTC_STATUS_INVALID_VALUE:
return "invalid value";
case CTC_STATUS_EXECUTION_FAILED:
return "execution failed";
case CTC_STATUS_UNKNOWN_ERROR:
default:
return "unknown error";
}
}
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float* costs,
void* workspace,
ctcOptions options) {
if (activations == nullptr || flat_labels == nullptr ||
label_lengths == nullptr || input_lengths == nullptr ||
costs == nullptr || workspace == nullptr || alphabet_size <= 0 ||
minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
if (options.loc == CTC_CPU) {
CpuCTC<float> ctc(alphabet_size,
minibatch,
workspace,
options.num_threads,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations,
gradients,
costs,
flat_labels,
label_lengths,
input_lengths);
else
return ctc.score_forward(
activations, costs, flat_labels, label_lengths, input_lengths);
} else if (options.loc == CTC_GPU) {
#ifdef __CUDACC__
GpuCTC<float> ctc(alphabet_size,
minibatch,
workspace,
options.stream,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations,
gradients,
costs,
flat_labels,
label_lengths,
input_lengths);
else
return ctc.score_forward(
activations, costs, flat_labels, label_lengths, input_lengths);
#else
std::cerr << "GPU execution requested, but not compiled with GPU support"
<< std::endl;
return CTC_STATUS_EXECUTION_FAILED;
#endif
} else {
return CTC_STATUS_INVALID_VALUE;
}
}
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
ctcOptions options,
size_t* size_bytes) {
if (label_lengths == nullptr || input_lengths == nullptr ||
size_bytes == nullptr || alphabet_size <= 0 || minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
// maxL and maxT are the maxima of label length and input length over all examples in the minibatch.
int maxL = *std::max_element(label_lengths, label_lengths + minibatch);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch);
const int S = 2 * maxL + 1;
*size_bytes = 0;
if (options.loc == CTC_GPU) {
// GPU storage
// nll_forward, nll_backward
*size_bytes += 2 * sizeof(float) * minibatch;
// repeats
*size_bytes += sizeof(int) * minibatch;
// label offsets
*size_bytes += sizeof(int) * minibatch;
// utt_length
*size_bytes += sizeof(int) * minibatch;
// label lengths
*size_bytes += sizeof(int) * minibatch;
// labels without blanks - overallocate for now
*size_bytes += sizeof(int) * maxL * minibatch;
// labels with blanks
*size_bytes += sizeof(int) * S * minibatch;
// alphas
*size_bytes += sizeof(float) * S * maxT * minibatch;
// denoms
*size_bytes += sizeof(float) * maxT * minibatch;
// probs (since we will pass in activations)
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
} else {
// On CPU, minibatch could eventually be replaced by the maximum
// number of concurrent threads if memory is really tight
// per minibatch memory
size_t per_minibatch_bytes = 0;
// output
per_minibatch_bytes += sizeof(float) * alphabet_size;
// alphas
per_minibatch_bytes += sizeof(float) * S * maxT;
// betas
per_minibatch_bytes += sizeof(float) * S;
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * S;
*size_bytes = per_minibatch_bytes * minibatch;
// probs
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
}
return CTC_STATUS_SUCCESS;
}
}
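
To make the CPU sizing arithmetic above concrete, here is a back-of-the-envelope check with invented numbers (a sketch, not part of the original file):

// minibatch = 1, alphabet_size = 5, one label of length 2 (so S = 2*2+1 = 5),
// and T = 10 time steps. Per-minibatch bytes: 4*5 (output) + 4*5*10 (alphas)
// + 4*5 (betas) + 3*4*5 (labels w/blanks, e_inc, s_inc) = 300; the probs
// buffer adds 4*5*10*1 = 200, so get_workspace_size should report 500.
int label_lengths[] = {2};
int input_lengths[] = {10};
ctcOptions opts{};   // loc == CTC_CPU, since that enum value is 0
size_t bytes = 0;
get_workspace_size(label_lengths, input_lengths, 5, 1, opts, &bytes);
// bytes == 500 for this configuration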

@@ -1,217 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Includes, system
// #include <stdio.h>
// #include <stdlib.h>
// Includes, cuda
// #include <cuda_runtime.h>
// #include <cublas_v2.h>
// Includes, cuda helper functions
// #include <helper_cuda.h>
// For the functors
#include "ctc.h"
#include "detail/ctc_helper.h"
const int warp_size = 32;
template <int NT, typename T, typename Rop>
struct CTAReduce;
template <int NT, typename T, typename Rop>
struct CTAReduce {
enum { Size = NT, Capacity = NT };
struct Storage {
T shared[Capacity];
};
__device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) {
T* s = storage.shared;
s[tid] = x;
__syncthreads();
// Fold the data in half with each pass.
#pragma unroll
for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
if (tid + offset < count && tid < offset) {
// Read from the right half and store to the left half.
x = g(x, s[offset + tid]);
s[tid] = x;
}
__syncthreads();
}
T shuff;
for (int offset = warp_size / 2; offset > 0; offset /= 2) {
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
shuff = __shfl_down_sync(0xFFFFFFFF, x, offset);
#else
shuff = __shfl_down(x, offset);
#endif
if (tid + offset < count && tid < offset) x = g(x, shuff);
}
return x;
}
};
template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(
Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) {
typedef CTAReduce<NT, T, Rop> R;
__shared__ typename R::Storage storage;
int tid = threadIdx.x;
int idx = tid;
int col = blockIdx.x;
T curr;
// Each block works on a column
if (idx < num_rows) curr = f(input[idx + col * num_rows]);
idx += NT;
while (idx < num_rows) {
curr = g(curr, f(input[idx + col * num_rows]));
idx += NT;
}
// Sum thread-totals over the CTA.
curr = R::reduce(tid, curr, storage, num_rows, g);
// Store result in out
if (tid == 0) output[col] = curr;
}
template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(
Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) {
__shared__ T s[NT];
int warps_per_block = NT / warp_size;
int row = blockDim.x * blockIdx.x + threadIdx.x;
int col = threadIdx.y;
T curr;
if (row < num_rows && col < num_cols) {
curr = f(input[row + col * num_rows]);
col += blockDim.y;
while (col < num_cols) {
curr = g(curr, f(input[row + col * num_rows]));
col += blockDim.y;
}
}
s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
__syncthreads();
// Reduce
if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
for (int i = 1; i < warps_per_block && i < num_cols; ++i)
curr = g(curr, s[i + threadIdx.x * warps_per_block]);
output[row] = curr;
}
}
struct ReduceHelper {
template <typename T, typename Iof, typename Rof>
static void impl(Iof f,
Rof g,
const T* input,
T* output,
int num_rows,
int num_cols,
bool axis,
cudaStream_t stream) {
int grid_size;
if (axis) {
grid_size = num_cols;
reduce_rows<128><<<grid_size, 128, 0, stream>>>(
f, g, input, output, num_rows, num_cols);
} else {
dim3 tpb(warp_size, 128 / warp_size);
grid_size = (num_cols + warp_size - 1) / warp_size;
reduce_cols<128><<<grid_size, tpb, 0, stream>>>(
f, g, input, output, num_rows, num_cols);
}
}
};
template <typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f,
Rof g,
const T* input,
T* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
cudaStreamSynchronize(stream);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
ctcStatus_t reduce_negate(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::negate<float>(),
ctc_helper::add<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
ctcStatus_t reduce_exp(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::exponential<float>(),
ctc_helper::add<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
ctcStatus_t reduce_max(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::identity<float>(),
ctc_helper::maximum<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
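
Reading the dispatch in ReduceHelper: axis = true launches reduce_rows with one block per column, so the output needs cols elements; axis = false launches reduce_cols, and the output needs rows elements. A hypothetical host-side call, assuming d_in and d_out are valid device allocations and stream is a live cudaStream_t:

// Column-wise maximum of a rows x cols matrix stored column-major on the GPU.
ctcStatus_t stat = reduce_max(d_in, d_out, rows, cols, /*axis=*/true, stream);
if (stat != CTC_STATUS_SUCCESS) {
  // the launch failed; reduce() already synchronized the stream and checked
  // cudaGetLastError before returning CTC_STATUS_EXECUTION_FAILED
}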

@@ -1,97 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <random>
#include <stdexcept>
#include <vector>
#include <ctc.h>
inline void throw_on_error(ctcStatus_t status, const char* message) {
if (status != CTC_STATUS_SUCCESS) {
throw std::runtime_error(
message + (", stat = " + std::string(ctcGetStatusString(status))));
}
}
#ifdef __CUDACC__
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
inline void throw_on_error(cudaError_t error, const char* message) {
if (error) {
throw thrust::system_error(error, thrust::cuda_category(), message);
}
}
#endif
std::vector<float> genActs(int size) {
std::vector<float> arr(size);
std::mt19937 gen(0);
std::uniform_real_distribution<> dis(0, 1);
for (int i = 0; i < size; ++i) arr[i] = dis(gen);
return arr;
}
std::vector<int> genLabels(int alphabet_size, int L) {
std::vector<int> label(L);
std::mt19937 gen(1);
std::uniform_int_distribution<> dis(1, alphabet_size - 1);
for (int i = 0; i < L; ++i) {
label[i] = dis(gen);
}
// guarantee repeats for testing
if (L >= 3) {
label[L / 2] = label[L / 2 + 1];
label[L / 2 - 1] = label[L / 2];
}
return label;
}
float rel_diff(const std::vector<float>& grad,
const std::vector<float>& num_grad) {
float diff = 0.;
float tot = 0.;
for (size_t idx = 0; idx < grad.size(); ++idx) {
diff += (grad[idx] - num_grad[idx]) * (grad[idx] - num_grad[idx]);
tot += grad[idx] * grad[idx];
}
return diff / tot;
}
// Numerically stable softmax for a minibatch of 1
void softmax(const float* const acts, int alphabet_size, int T, float* probs) {
for (int t = 0; t < T; ++t) {
float max_activation = -std::numeric_limits<float>::infinity();
for (int a = 0; a < alphabet_size; ++a)
max_activation = std::max(max_activation, acts[t * alphabet_size + a]);
float denom = 0;
for (int a = 0; a < alphabet_size; ++a)
denom += std::exp(acts[t * alphabet_size + a] - max_activation);
for (int a = 0; a < alphabet_size; ++a)
probs[t * alphabet_size + a] =
std::exp(acts[t * alphabet_size + a] - max_activation) / denom;
}
}
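
A quick sketch of how these helpers compose (illustrative, not from the original file):

const int T = 4, alphabet_size = 3;
std::vector<float> acts = genActs(T * alphabet_size);   // uniform values in [0, 1)
std::vector<float> probs(acts.size());
softmax(acts.data(), alphabet_size, T, probs.data());   // each time step now sums to 1
std::vector<int> labels = genLabels(alphabet_size, 3);  // labels drawn from [1, alphabet_size)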

File diff suppressed because it is too large

File diff suppressed because it is too large