remove the warpctc patch command and patch files to improve the quality of the Paddle repo (#21929)

release/1.7
zhouwei25 5 years ago committed by Tao Luo
parent b1ec1d54cf
commit 8b15acd71d

@@ -14,11 +14,11 @@
INCLUDE(ExternalProject)
-SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
-SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
+SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
+SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
-set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc)
-set(WARPCTC_TAG 6d5b8fac130638862d97dc48ef43a8d7b5a503bb)
+set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc)
+set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
@@ -36,14 +36,6 @@ cache_third_party(extern_warpctc
REPOSITORY ${WARPCTC_REPOSITORY}
TAG ${WARPCTC_TAG})
-if(WIN32)
-file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/warpctc native_src)
-file(TO_NATIVE_PATH ${WARPCTC_SOURCE_DIR} native_dst)
-set(WARPCTC_PATCH_COMMAND xcopy ${native_src} ${native_dst} /E/Y)
-else()
-set(WARPCTC_PATCH_COMMAND cp -r ${PADDLE_SOURCE_DIR}/patches/warpctc/. ${WARPCTC_SOURCE_DIR})
-endif()
ExternalProject_Add(
extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS}
@@ -52,7 +44,7 @@ ExternalProject_Add(
PREFIX ${WARPCTC_PREFIX_DIR}
SOURCE_DIR ${WARPCTC_SOURCE_DIR}
UPDATE_COMMAND ""
-PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
+PATCH_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}

@@ -1,230 +0,0 @@
IF (APPLE)
cmake_minimum_required(VERSION 3.4)
ELSE()
cmake_minimum_required(VERSION 2.8)
ENDIF()
project(ctc_release)
include_directories(include)
FIND_PACKAGE(CUDA 6.5)
FIND_PACKAGE(Torch)
MESSAGE(STATUS "cuda found ${CUDA_FOUND}")
MESSAGE(STATUS "Torch found ${Torch_DIR}")
option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND})
option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND})
option(WITH_OMP "compile warp-ctc with OpenMP." ON)
option(BUILD_TESTS "build warp-ctc unit tests." ON)
option(BUILD_SHARED "build warp-ctc shared library." ON)
if(BUILD_SHARED)
set(WARPCTC_SHARED "SHARED")
else(BUILD_SHARED)
set(WARPCTC_SHARED "STATIC")
endif(BUILD_SHARED)
if(WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
else(WIN32)
# Set c++ flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2")
endif(WIN32)
if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
add_definitions(-DAPPLE)
endif()
if(WITH_OMP AND NOT APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
else()
add_definitions(-DCTC_DISABLE_OMP)
endif()
# needs to be at least 30 or __shfl_down in reduce won't compile
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
IF (CUDA_VERSION VERSION_GREATER "7.6")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
ENDIF()
IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0"))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
ENDIF()
IF(NOT APPLE AND NOT WIN32)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
if(WITH_OMP)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
endif()
ENDIF()
IF (APPLE)
EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}")
# for El Capitan we have to use rpath
IF (DARWIN_VERSION LESS 15)
set(CMAKE_SKIP_RPATH TRUE)
ENDIF ()
ELSE()
# always skip for Linux
set(CMAKE_SKIP_RPATH TRUE)
ENDIF()
# Windows treats a symbolic file as a real file, which is different from Unix.
# We create a hidden file and compile it instead of the original source file.
function(windows_symbolic TARGET)
set(oneValueArgs "")
set(multiValueArgs SRCS PATH DEPS)
cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
foreach(src ${windows_symbolic_SRCS})
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu)
message(FATAL_ERROR "${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic file.")
endif()
# only copy xx.cu to .xx.cu when the content is modified
set(copy_flag 1)
if (EXISTS ${final_path}/.${src}.cu)
file(READ ${final_path}/${src}.cpp SOURCE_STR)
file(READ ${final_path}/.${src}.cu TARGET_STR)
if (SOURCE_STR STREQUAL TARGET_STR)
set(copy_flag 0)
endif()
endif()
if (copy_flag)
add_custom_command(OUTPUT ${final_path}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
endif(copy_flag)
add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu)
endforeach()
endfunction()
IF (WITH_GPU)
MESSAGE(STATUS "Building shared library with GPU support")
MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS})
if (WIN32)
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"")
windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src)
CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu)
else()
CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu)
endif(WIN32)
IF (NOT WITH_TORCH)
TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY})
ENDIF()
if(BUILD_TESTS)
add_executable(test_cpu tests/test_cpu.cpp )
TARGET_LINK_LIBRARIES(test_cpu warpctc)
SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
cuda_add_executable(test_gpu tests/test_gpu.cu)
TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
endif(BUILD_TESTS)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "bin"
LIBRARY DESTINATION "lib"
ARCHIVE DESTINATION "lib")
INSTALL(FILES include/ctc.h DESTINATION "include")
IF (WITH_TORCH)
MESSAGE(STATUS "Building Torch Bindings with GPU support")
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc")
INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC)
TARGET_LINK_LIBRARIES(warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")
SET(src torch_binding/binding.cpp torch_binding/utils.c)
SET(luasrc torch_binding/init.lua)
ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
IF (APPLE)
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
ELSE()
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp)
ENDIF()
ENDIF()
ELSE()
MESSAGE(STATUS "Building shared library with no GPU support")
if (NOT APPLE AND NOT WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
ENDIF()
ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp)
if(BUILD_TESTS)
add_executable(test_cpu tests/test_cpu.cpp )
TARGET_LINK_LIBRARIES(test_cpu warpctc)
SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
endif(BUILD_TESTS)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "bin"
LIBRARY DESTINATION "lib"
ARCHIVE DESTINATION "lib")
INSTALL(FILES include/ctc.h DESTINATION "include")
IF (WITH_TORCH)
MESSAGE(STATUS "Building Torch Bindings with no GPU support")
add_definitions(-DTORCH_NOGPU)
INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH)
TARGET_LINK_LIBRARIES(warpctc luajit luaT TH)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")
SET(src torch_binding/binding.cpp torch_binding/utils.c)
SET(luasrc torch_binding/init.lua)
ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
IF (APPLE)
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH)
ELSE()
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH gomp)
ENDIF()
ENDIF()
ENDIF()

@@ -1,160 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef _WIN32
#ifdef warpctc_EXPORTS
#define API_REFERENCE extern "C" __declspec(dllexport)
#else
#define API_REFERENCE extern "C" __declspec(dllimport)
#endif
#else
#define API_REFERENCE
#endif
#ifdef __cplusplus
#include <cstddef>
extern "C" {
#endif
// forward declaration of a CUDA typedef to avoid needing to pull in CUDA headers
typedef struct CUstream_st* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc
* library */
API_REFERENCE int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status);
typedef enum { CTC_CPU = 0, CTC_GPU = 1 } ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
* should zero out the array using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank
/// label
int blank_label;
};
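// Illustrative usage (not part of the original header): value-initialization
// zeroes every field, per the forward-compatibility note above, after which
// only the fields you need are set.
//
//     ctcOptions options{};      // loc, the union, and blank_label all zeroed
//     options.loc = CTC_CPU;
//     options.num_threads = 4;   // CPU path: maximum worker threads
//     options.blank_label = 0;   // label index reserved for the CTC blank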
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
*             addressable memory, depending on info. We assume a fixed memory
*             layout for this 3-dimensional tensor, which has dimension
*             (t, n, p), where t is the time index, n is the minibatch index,
*             and p indexes over probabilities of each symbol in the alphabet.
*             The memory layout is (t, n, p) in C order (slowest to fastest
*             changing index, aka row-major), or (p, n, t) in Fortran order
*             (fastest to slowest changing index, aka column-major). We also
*             assume strides are equal to dimensions - there is no padding
*             between dimensions. More precisely, element (t, n, p), for a
*             problem with mini_batch examples in the minibatch and
*             alphabet_size symbols in the alphabet, is located at:
*             activations[(t * mini_batch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as probs and memory
* ordering is identical.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
* \param [in] mini_batch How many examples in a minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In same memory space as probs. Should be of
* size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float* costs,
void* workspace,
ctcOptions options);
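// Worked layout example (illustrative values, not from the original header):
// with mini_batch = 2 and alphabet_size = 4, the activation for time step
// t = 3, example n = 1, symbol p = 2 is at
//     activations[(3 * 2 + 1) * 4 + 2] == activations[30].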
/** For a given set of labels and minibatch size return the required workspace
* size. This will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
* \param [in] mini_batch How many examples in a minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes is a pointer to a scalar where the memory
*              requirement in bytes will be placed. This memory should be
*              allocated at the same place, CPU or GPU, that the probs are in
*
* \return Status information
**/
API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
ctcOptions info,
size_t* size_bytes);
#ifdef __cplusplus
}
#endif
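
Taken together, the header documents a two-call protocol: size the workspace first, then compute the loss. Below is a minimal, hypothetical CPU driver sketched from the declarations above alone; the constants (T, the all-0.2f activations, the label values) are invented for illustration.

#include <vector>
#include "ctc.h"

int main() {
  const int alphabet_size = 5, T = 10, minibatch = 1;
  std::vector<float> activations(T * minibatch * alphabet_size, 0.2f);
  std::vector<int> labels = {1, 2, 1};   // flat_labels for the one example
  std::vector<int> label_lengths = {3};
  std::vector<int> input_lengths = {T};
  std::vector<float> costs(minibatch);

  ctcOptions options{};                  // zeroed for forward compatibility
  options.loc = CTC_CPU;
  options.num_threads = 1;

  size_t workspace_bytes = 0;
  if (get_workspace_size(label_lengths.data(), input_lengths.data(),
                         alphabet_size, minibatch, options,
                         &workspace_bytes) != CTC_STATUS_SUCCESS)
    return 1;

  std::vector<char> workspace(workspace_bytes);
  // gradients == nullptr selects the score-only (forward) path
  ctcStatus_t stat = compute_ctc_loss(activations.data(), nullptr,
                                      labels.data(), label_lengths.data(),
                                      input_lengths.data(), alphabet_size,
                                      minibatch, costs.data(),
                                      workspace.data(), options);
  return stat == CTC_STATUS_SUCCESS ? 0 : 1;
}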

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -1,38 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef __CUDACC__
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif
// NOTE(dzhwinter)
// The warp shuffle primitives changed with CUDA 9 (Volta-generation GPUs).
// Add a wrapper so the code stays compatible from CUDA 7 through CUDA 9.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define DEFAULT_MASK 0u
template <typename T>
__forceinline__ __device__ T __shfl_down(T input, int delta) {
return __shfl_down_sync(DEFAULT_MASK, input, delta);
}
template <typename T>
__forceinline__ __device__ T __shfl_up(T input, int delta) {
return __shfl_up_sync(DEFAULT_MASK, input, delta);
}
#endif

@@ -1,186 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <ctc.h>
#include "detail/cpu_ctc.h"
#ifdef __CUDACC__
#include "detail/gpu_ctc.h"
#endif
extern "C" {
int get_warpctc_version() { return 2; }
const char* ctcGetStatusString(ctcStatus_t status) {
switch (status) {
case CTC_STATUS_SUCCESS:
return "no error";
case CTC_STATUS_MEMOPS_FAILED:
return "cuda memcpy or memset failed";
case CTC_STATUS_INVALID_VALUE:
return "invalid value";
case CTC_STATUS_EXECUTION_FAILED:
return "execution failed";
case CTC_STATUS_UNKNOWN_ERROR:
default:
return "unknown error";
}
}
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float* costs,
void* workspace,
ctcOptions options) {
if (activations == nullptr || flat_labels == nullptr ||
label_lengths == nullptr || input_lengths == nullptr ||
costs == nullptr || workspace == nullptr || alphabet_size <= 0 ||
minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
if (options.loc == CTC_CPU) {
CpuCTC<float> ctc(alphabet_size,
minibatch,
workspace,
options.num_threads,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations,
gradients,
costs,
flat_labels,
label_lengths,
input_lengths);
else
return ctc.score_forward(
activations, costs, flat_labels, label_lengths, input_lengths);
} else if (options.loc == CTC_GPU) {
#ifdef __CUDACC__
GpuCTC<float> ctc(alphabet_size,
minibatch,
workspace,
options.stream,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations,
gradients,
costs,
flat_labels,
label_lengths,
input_lengths);
else
return ctc.score_forward(
activations, costs, flat_labels, label_lengths, input_lengths);
#else
std::cerr << "GPU execution requested, but not compiled with GPU support"
<< std::endl;
return CTC_STATUS_EXECUTION_FAILED;
#endif
} else {
return CTC_STATUS_INVALID_VALUE;
}
}
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
ctcOptions options,
size_t* size_bytes) {
if (label_lengths == nullptr || input_lengths == nullptr ||
size_bytes == nullptr || alphabet_size <= 0 || minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
// maxL and maxT are the maxima of label length and input length over all examples in the minibatch.
int maxL = *std::max_element(label_lengths, label_lengths + minibatch);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch);
const int S = 2 * maxL + 1;
*size_bytes = 0;
if (options.loc == CTC_GPU) {
// GPU storage
// nll_forward, nll_backward
*size_bytes += 2 * sizeof(float) * minibatch;
// repeats
*size_bytes += sizeof(int) * minibatch;
// label offsets
*size_bytes += sizeof(int) * minibatch;
// utt_length
*size_bytes += sizeof(int) * minibatch;
// label lengths
*size_bytes += sizeof(int) * minibatch;
// labels without blanks - overallocate for now
*size_bytes += sizeof(int) * maxL * minibatch;
// labels with blanks
*size_bytes += sizeof(int) * S * minibatch;
// alphas
*size_bytes += sizeof(float) * S * maxT * minibatch;
// denoms
*size_bytes += sizeof(float) * maxT * minibatch;
// probs (since we will pass in activations)
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
} else {
// On CPU, minibatch could eventually be replaced by the maximum
// number of concurrent threads if memory is really tight
// per minibatch memory
size_t per_minibatch_bytes = 0;
// output
per_minibatch_bytes += sizeof(float) * alphabet_size;
// alphas
per_minibatch_bytes += sizeof(float) * S * maxT;
// betas
per_minibatch_bytes += sizeof(float) * S;
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * S;
*size_bytes = per_minibatch_bytes * minibatch;
// probs
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
}
return CTC_STATUS_SUCCESS;
}
}
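
To make the CPU sizing arithmetic above concrete, here is a back-of-the-envelope check with invented numbers (a sketch, not part of the original file):

// minibatch = 1, alphabet_size = 5, one label of length 2 (so S = 2*2+1 = 5),
// and T = 10 time steps. Per-minibatch bytes: 4*5 (output) + 4*5*10 (alphas)
// + 4*5 (betas) + 3*4*5 (labels w/blanks, e_inc, s_inc) = 300; the probs
// buffer adds 4*5*10*1 = 200, so get_workspace_size should report 500.
int label_lengths[] = {2};
int input_lengths[] = {10};
ctcOptions opts{};   // loc == CTC_CPU, since that enum value is 0
size_t bytes = 0;
get_workspace_size(label_lengths, input_lengths, 5, 1, opts, &bytes);
// bytes == 500 for this configuration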

@@ -1,217 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Includes, system
// #include <stdio.h>
// #include <stdlib.h>
// Includes, cuda
// #include <cuda_runtime.h>
// #include <cublas_v2.h>
// Includes, cuda helper functions
// #include <helper_cuda.h>
// For the functors
#include "ctc.h"
#include "detail/ctc_helper.h"
const int warp_size = 32;
template <int NT, typename T, typename Rop>
struct CTAReduce;
template <int NT, typename T, typename Rop>
struct CTAReduce {
enum { Size = NT, Capacity = NT };
struct Storage {
T shared[Capacity];
};
__device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) {
T* s = storage.shared;
s[tid] = x;
__syncthreads();
// Fold the data in half with each pass.
#pragma unroll
for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
if (tid + offset < count && tid < offset) {
// Read from the right half and store to the left half.
x = g(x, s[offset + tid]);
s[tid] = x;
}
__syncthreads();
}
T shuff;
for (int offset = warp_size / 2; offset > 0; offset /= 2) {
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
shuff = __shfl_down_sync(0xFFFFFFFF, x, offset);
#else
shuff = __shfl_down(x, offset);
#endif
if (tid + offset < count && tid < offset) x = g(x, shuff);
}
return x;
}
};
template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(
Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) {
typedef CTAReduce<NT, T, Rop> R;
__shared__ typename R::Storage storage;
int tid = threadIdx.x;
int idx = tid;
int col = blockIdx.x;
T curr;
// Each block works on a column
if (idx < num_rows) curr = f(input[idx + col * num_rows]);
idx += NT;
while (idx < num_rows) {
curr = g(curr, f(input[idx + col * num_rows]));
idx += NT;
}
// Sum thread-totals over the CTA.
curr = R::reduce(tid, curr, storage, num_rows, g);
// Store result in out
if (tid == 0) output[col] = curr;
}
template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(
Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) {
__shared__ T s[NT];
int warps_per_block = NT / warp_size;
int row = blockDim.x * blockIdx.x + threadIdx.x;
int col = threadIdx.y;
T curr;
if (row < num_rows && col < num_cols) {
curr = f(input[row + col * num_rows]);
col += blockDim.y;
while (col < num_cols) {
curr = g(curr, f(input[row + col * num_rows]));
col += blockDim.y;
}
}
s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
__syncthreads();
// Reduce
if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
for (int i = 1; i < warps_per_block && i < num_cols; ++i)
curr = g(curr, s[i + threadIdx.x * warps_per_block]);
output[row] = curr;
}
}
struct ReduceHelper {
template <typename T, typename Iof, typename Rof>
static void impl(Iof f,
Rof g,
const T* input,
T* output,
int num_rows,
int num_cols,
bool axis,
cudaStream_t stream) {
int grid_size;
if (axis) {
grid_size = num_cols;
reduce_rows<128><<<grid_size, 128, 0, stream>>>(
f, g, input, output, num_rows, num_cols);
} else {
dim3 tpb(warp_size, 128 / warp_size);
grid_size = (num_cols + warp_size - 1) / warp_size;
reduce_cols<128><<<grid_size, tpb, 0, stream>>>(
f, g, input, output, num_rows, num_cols);
}
}
};
template <typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f,
Rof g,
const T* input,
T* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
cudaStreamSynchronize(stream);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
ctcStatus_t reduce_negate(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::negate<float>(),
ctc_helper::add<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
ctcStatus_t reduce_exp(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::exponential<float>(),
ctc_helper::add<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
ctcStatus_t reduce_max(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::identity<float>(),
ctc_helper::maximum<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
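
Reading the dispatch in ReduceHelper: axis = true launches reduce_rows with one block per column, so the output needs cols elements; axis = false launches reduce_cols, and the output needs rows elements. A hypothetical host-side call, assuming d_in and d_out are valid device allocations and stream is a live cudaStream_t:

// Column-wise maximum of a rows x cols matrix stored column-major on the GPU.
ctcStatus_t stat = reduce_max(d_in, d_out, rows, cols, /*axis=*/true, stream);
if (stat != CTC_STATUS_SUCCESS) {
  // the launch failed; reduce() already synchronized the stream and checked
  // cudaGetLastError before returning CTC_STATUS_EXECUTION_FAILED
}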

@@ -1,97 +0,0 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <random>
#include <stdexcept>
#include <vector>
#include <ctc.h>
inline void throw_on_error(ctcStatus_t status, const char* message) {
if (status != CTC_STATUS_SUCCESS) {
throw std::runtime_error(
message + (", stat = " + std::string(ctcGetStatusString(status))));
}
}
#ifdef __CUDACC__
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
inline void throw_on_error(cudaError_t error, const char* message) {
if (error) {
throw thrust::system_error(error, thrust::cuda_category(), message);
}
}
#endif
std::vector<float> genActs(int size) {
std::vector<float> arr(size);
std::mt19937 gen(0);
std::uniform_real_distribution<> dis(0, 1);
for (int i = 0; i < size; ++i) arr[i] = dis(gen);
return arr;
}
std::vector<int> genLabels(int alphabet_size, int L) {
std::vector<int> label(L);
std::mt19937 gen(1);
std::uniform_int_distribution<> dis(1, alphabet_size - 1);
for (int i = 0; i < L; ++i) {
label[i] = dis(gen);
}
// guarantee repeats for testing
if (L >= 3) {
label[L / 2] = label[L / 2 + 1];
label[L / 2 - 1] = label[L / 2];
}
return label;
}
float rel_diff(const std::vector<float>& grad,
const std::vector<float>& num_grad) {
float diff = 0.;
float tot = 0.;
for (size_t idx = 0; idx < grad.size(); ++idx) {
diff += (grad[idx] - num_grad[idx]) * (grad[idx] - num_grad[idx]);
tot += grad[idx] * grad[idx];
}
return diff / tot;
}
// Numerically stable softmax for a minibatch of 1
void softmax(const float* const acts, int alphabet_size, int T, float* probs) {
for (int t = 0; t < T; ++t) {
float max_activation = -std::numeric_limits<float>::infinity();
for (int a = 0; a < alphabet_size; ++a)
max_activation = std::max(max_activation, acts[t * alphabet_size + a]);
float denom = 0;
for (int a = 0; a < alphabet_size; ++a)
denom += std::exp(acts[t * alphabet_size + a] - max_activation);
for (int a = 0; a < alphabet_size; ++a)
probs[t * alphabet_size + a] =
std::exp(acts[t * alphabet_size + a] - max_activation) / denom;
}
}
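
A quick sketch of how these helpers compose (illustrative, not from the original file):

const int T = 4, alphabet_size = 3;
std::vector<float> acts = genActs(T * alphabet_size);   // uniform values in [0, 1)
std::vector<float> probs(acts.size());
softmax(acts.data(), alphabet_size, T, probs.data());   // each time step now sums to 1
std::vector<int> labels = genLabels(alphabet_size, 3);  // labels drawn from [1, alphabet_size)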

File diff suppressed because it is too large

File diff suppressed because it is too large