fix merge conflict

avx_docs
qijun 9 years ago
commit db569f293e

@@ -35,6 +35,8 @@ addons:
- libgoogle-glog-dev
- libgflags-dev
- libgtest-dev
- curl
- lcov
- graphviz
before_install:
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi

@@ -9,7 +9,7 @@ set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATC
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
include(package)
include(swig)
find_package(SWIG 2.0)
find_package(CUDA QUIET)
find_package(Protobuf REQUIRED)
find_package(PythonLibs 2.7 REQUIRED)
@@ -40,6 +40,9 @@ option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
option(ON_TRAVIS "Running test on travis-ci or not." OFF)
option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
@@ -49,11 +52,16 @@ endif()
include(enableCXX11)
include(cpplint)
include(ccache)
if(WITH_RDMA)
include(rdma)
endif()
include(util)
include(flags)
include(cudnn)
include(FindPythonModule)
include(check_packages)
include(swig)
include(coveralls)
# add PaddlePaddle version
if(DEFINED ENV{PADDLE_VERSION})
@@ -129,9 +137,11 @@ else(WITH_PYTHON)
add_definitions(-DPADDLE_NO_PYTHON)
endif(WITH_PYTHON)
if(NOT WITH_RDMA)
add_definitions(-DPADDLE_DISABLE_RDMA)
endif()
if(WITH_RDMA)
include_directories("${RDMA_INC_DIR}")
else(WITH_RDMA)
add_definitions(-DPADDLE_DISABLE_RDMA)
endif(WITH_RDMA)
if(WITH_GLOG)
add_definitions(-DPADDLE_USE_GLOG)

@@ -0,0 +1,14 @@
Thank you for contributing to PaddlePaddle. Submitting an issue is a great help for us.
Both Chinese and English issues are welcome.
It's hard to solve a problem when important details are missing.
Before submitting the issue, please check it against the following criteria.
- [ ] Was a similar issue submitted or resolved before? You can search existing issues on GitHub.
- [ ] Did you search for your issue on widespread search engines?
- [ ] Is my description of the issue clear enough to reproduce the problem?
* If an error occurred, we need details: `How do you run your code?`, `What system do you use?`, `Are you using a GPU or not?`, etc.
* If you use [asciinema](https://asciinema.org/) to record what you are doing to make it happen, that's awesome! We can help you solve the problem more quickly.
- [ ] Does my description of the issue use GitHub Markdown correctly?
* Please use proper Markdown syntax for styling all forms of writing, e.g., source code, error messages, etc.
* Check out [this page](https://guides.github.com/features/mastering-markdown/) to find out much more about Markdown.

@@ -1,8 +1,10 @@
# PaddlePaddle
| **`Linux`** | **`License`** | **`Chat Room`** |
|----------------|---------------|-----------------|
|[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)|[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)|[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)|
[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
[![Coverage Status](https://coveralls.io/repos/github/baidu/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
Welcome to the PaddlePaddle GitHub.

@@ -1,4 +1,4 @@
# Find the CBlas libraries
# Find the CBlas and LAPACK libraries
#
# It will search MKL, ATLAS, OpenBlas, reference-cblas, in that order.
#
@@ -19,6 +19,8 @@ set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
find_path(MKL_INCLUDE_DIR mkl.h PATHS
${MKL_ROOT}/include)
find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS
${MKL_ROOT}/include)
find_library(MKL_CORE_LIB NAMES mkl_core PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
@@ -37,6 +39,7 @@ if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
${MKL_SEQUENTIAL_LIB}
${MKL_CORE_LIB})
add_definitions(-DPADDLE_USE_MKL)
message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return() # stop processing the rest of this file.
endif()
@@ -55,15 +58,19 @@ set(ATLAS_LIB_SEARCH_PATHS
)
find_path(ATLAS_INC_DIR NAMES cblas.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
find_library(ATLAS_LIB NAMES atlas libatlas.so.3
find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
set(CBLAS_PROVIDER ATLAS)
set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
add_definitions(-DPADDLE_USE_ATLAS)
message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return()
endif()
@@ -83,6 +90,8 @@ set(OPENBLAS_LIB_SEARCH_PATHS
find_path(OPENBLAS_INC_DIR NAMES cblas.h
PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
find_library(OPENBLAS_LIB NAMES openblas
PATHS ${OPENBLAS_LIB_SEARCH_PATHS})
@@ -90,6 +99,7 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
set(CBLAS_PROVIDER OPENBLAS)
set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
set(CBLAS_LIBS ${OPENBLAS_LIB})
message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return()
endif()
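The new `mkl_lapacke.h`, `clapack.h`, and `lapacke.h` checks above pull in a LAPACK interface alongside CBLAS, presumably for the matrix-inverse support added elsewhere in this commit. As a hedged sketch of what that interface provides (standard LAPACKE naming as in OpenBLAS/MKL; ATLAS exposes a similar `clapack` API), a CPU matrix inverse via LU factorization:

```cpp
#include <lapacke.h>  // supplied by whichever CBLAS/LAPACK provider was found
#include <vector>

// Invert an n x n row-major matrix in place; returns true on success.
bool invertInPlace(double* a, lapack_int n) {
  std::vector<lapack_int> ipiv(n);
  // LU factorization: A = P * L * U
  if (LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv.data()) != 0)
    return false;  // singular input (or bad argument)
  // Inverse computed from the LU factors
  return LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv.data()) == 0;
}
```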

@@ -0,0 +1,103 @@
# CMake script for code coverage.
# If _COVERALLS_UPLOAD is ON, it will upload the JSON files to coveralls.io automatically.
# Param _COVERAGE_SRCS A list of coverage source files.
# Param _COVERALLS_UPLOAD Upload the result to coveralls.
# Param _CMAKE_SCRIPT_PATH CMake script path.
function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
# clean previous gcov data.
file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda)
# find curl for uploading the JSON later.
if (_COVERALLS_UPLOAD)
find_program(CURL_EXECUTABLE curl)
if (NOT CURL_EXECUTABLE)
message(FATAL_ERROR "Coveralls: curl not found!")
endif()
endif()
# When passing a CMake list to an external process, the list
# will be converted from the format "1;2;3" to "1 2 3".
set(COVERAGE_SRCS "")
foreach (SINGLE_SRC ${_COVERAGE_SRCS})
set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
endforeach()
# query number of logical cores
cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES)
# coveralls json file.
set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json)
add_custom_target(coveralls_generate
# Run regression tests.
COMMAND ${CMAKE_CTEST_COMMAND}
-j ${core_size}
--output-on-failure
# Generate Gcov and translate it into coveralls JSON.
COMMAND ${CMAKE_COMMAND}
-DCOVERAGE_SRCS="${COVERAGE_SRCS}"
-DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}"
-DCOV_PATH="${PROJECT_BINARY_DIR}"
-DPROJECT_ROOT="${PROJECT_SOURCE_DIR}"
-P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake"
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
COMMENT "Coveralls: generating coveralls output..."
)
if (_COVERALLS_UPLOAD)
message("COVERALLS UPLOAD: ON")
# Upload the JSON to coveralls.
add_custom_target(coveralls_upload
COMMAND ${CURL_EXECUTABLE}
-S -F json_file=@${COVERALLS_FILE}
https://coveralls.io/api/v1/jobs
DEPENDS coveralls_generate
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
COMMENT "Coveralls: uploading coveralls output...")
add_custom_target(coveralls DEPENDS coveralls_upload)
else()
message("COVERALLS UPLOAD: OFF")
add_custom_target(coveralls DEPENDS coveralls_generate)
endif()
endfunction()
if(ON_COVERALLS)
set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
set(EXCLUDE_DIRS
"demo/"
"build/"
"tests/"
".test_env/"
)
if(WITH_GPU)
file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c" "*.cu")
else()
file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c")
endif()
# exclude files under EXCLUDE_DIRS from PADDLE_SOURCES
foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
foreach(TMP_PATH ${PADDLE_SOURCES})
string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
endif()
endforeach(TMP_PATH)
endforeach()
# convert to absolute path
set(PADDLE_SRCS "")
foreach(PADDLE_SRC ${PADDLE_SOURCES})
set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
endforeach()
code_coverage(
"${PADDLE_SRCS}"
${COVERALLS_UPLOAD}
"${PROJECT_SOURCE_DIR}/cmake"
)
endif()

File diff suppressed because it is too large.

@@ -0,0 +1,76 @@
# users should download rdma from the subversion repository first
# execute the following instructions to check it out manually
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
# we use the static output in the svn repositories to avoid implicit bugs from a non-standard runtime env.
set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
function(generate_rdma_links)
#redirect to the current DIR to isolate the pollution from the system runtime environment
#it benefits unified control across different gcc environments.
#e.g., by default gcc48 does not refer to /usr/lib64, which could contain low-version
#runtime libraries that crash the process while being loaded. This redirect trick
#fixes that.
execute_process(
COMMAND mkdir -p librdma
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endfunction(generate_rdma_links)
#check and set headers
find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
#check and set libs
find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
if(
RDMA_INC_SXISOCK AND
RDMA_INC_XIO AND
RDMA_INC_EVENT AND
RDMA_INC_NUMA AND
RDMA_LIB_SXISOCK AND
RDMA_LIB_XIO AND
RDMA_LIB_EVENT AND
RDMA_LIB_EVENT_CORE AND
RDMA_LIB_EVENT_EXTRA AND
RDMA_LIB_EVENT_PTHREADS AND
RDMA_LIB_NUMA
)
set(RDMA_INC_DIR
${RDMA_INC_SXISOCK}
${RDMA_INC_XIO}
${RDMA_INC_EVENT}
${RDMA_INC_NUMA})
set(RDMA_LIBS
${RDMA_LIB_SXISOCK}
${RDMA_LIB_XIO}
${RDMA_LIB_EVENT}
${RDMA_LIB_EVENT_CORE}
${RDMA_LIB_EVENT_EXTRA}
${RDMA_LIB_EVENT_PTHREADS}
${RDMA_LIB_NUMA}
)
set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
return()
endif()
#if this module is not called, RDMA_INC_DIR and RDMA_LIBS will be null, so the top module can always refer to these variables
message(FATAL_ERROR "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")

@@ -1,25 +1,3 @@
find_program(
SWIG_BINARY_PATH
swig)
if(${SWIG_BINARY_PATH} STREQUAL "SWIG_BINARY_PATH-NOTFOUND")
set(SWIG_FOUND OFF)
else()
set(SWIG_FOUND ON)
endif()
set(MIN_SWIG_VERSION 2)
if(SWIG_FOUND)
execute_process(COMMAND sh -c "${SWIG_BINARY_PATH} -version | grep Version | cut -f3 -d' '"
OUTPUT_VARIABLE _SWIG_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION})
message("swig version ${MIN_SWIG_VERSION} or greater is needed for generating python api. "
"Only version ${_SWIG_VERSION} is found. Set SWIG_FOUND to FALSE")
set(SWIG_FOUND FALSE)
endif(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION})
endif(SWIG_FOUND)
function(generate_python_api target_name)
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
${PROJ_ROOT}/paddle/Paddle_wrap.cxx

@@ -67,6 +67,10 @@ endmacro()
#
# It will handle WITH_PYTHON/WITH_GLOG etc.
function(link_paddle_exe TARGET_NAME)
if(WITH_RDMA)
generate_rdma_links()
endif()
if(WITH_METRIC)
if(WITH_GPU)
set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric metric_cpu)
@@ -109,6 +113,12 @@ function(link_paddle_exe TARGET_NAME)
${ZLIB_LIBRARIES}
${INTERAL_LIBS}
${CMAKE_DL_LIBS})
if(WITH_RDMA)
target_link_libraries(${TARGET_NAME}
${RDMA_LD_FLAGS}
${RDMA_LIBS})
endif()
if(WITH_PYTHON)
target_link_libraries(${TARGET_NAME}

@@ -0,0 +1,10 @@
*.pyc
train.log
data/feature
data/conll05st-release/
data/src.dict
data/test.wsj.props
data/test.wsj.seq_pair
data/test.wsj.words
data/tgt.dict
output

@@ -4,7 +4,6 @@ Installing from Sources
* [1. Download and Setup](#download)
* [2. Requirements](#requirements)
* [3. Build on Ubuntu](#ubuntu)
* [4. Build on Mac OS X](#mac)
## <span id="download">Download and Setup</span>
You can download PaddlePaddle from the [github source](https://github.com/gangliao/Paddle).
@@ -191,122 +190,3 @@ sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
# or just run
sudo paddle version
```
## <span id="mac">Building on Mac OS X</span>
### Prerequisites
This guide is based on Mac OS X 10.11 (El Capitan). Note that if you are running an up-to-date version of OS X,
you will already have Python 2.7.10 and NumPy 1.8 installed.
The best option is to use the package manager Homebrew to handle installations and upgrades for you.
To install [homebrew](http://brew.sh/), first open a terminal window (you can find Terminal in the Utilities folder in Applications), and issue the command:
```bash
# install brew
/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
# install pip
easy_install pip
```
### Install Dependencies
- **CPU Dependencies**
```bash
# Install fundamental dependencies
brew install glog gflags cmake protobuf openblas
# Install google test on Mac OS X
# Download gtest 1.7.0
wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz
tar -xvf release-1.7.0.tar.gz && cd googletest-release-1.7.0
# Build gtest
mkdir build && cd build && cmake ..
make
# Install gtest library
sudo cp -r ../include/gtest /usr/local/include/
sudo cp lib*.a /usr/local/lib
```
- **GPU Dependencies (optional)**
To build the GPU version, you will need the following installed:
1. a CUDA-capable GPU
2. Mac OS X 10.11 or later
3. the Clang compiler and toolchain installed using Xcode
4. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
5. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
The CUDA development environment relies on tight integration with the host development environment,
including the host compiler and C runtime libraries, and is therefore only supported on
distribution versions that have been qualified for this CUDA Toolkit release.
1. After downloading cuDNN library, issue the following commands:
```bash
sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib/libcudnn*
```
2. Then you need to set the DYLD\_LIBRARY\_PATH and PATH environment variables in ~/.bashrc.
```bash
export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH
export PATH=/usr/local/cuda/bin:$PATH
```
### Build and Install
As usual, the best option is to create a build folder under the paddle project directory.
```bash
mkdir build && cd build
cmake ..
```
CMake first checks PaddlePaddle's dependencies in the system default paths. If an optional
library is installed, the corresponding build option is set automatically (for instance, glog, gtest and gflags).
If a dependency is still not found, you can set it manually based on the CMake error information on your screen.
As a simple example, consider the following:
- **Only CPU**
```bash
cmake .. -DWITH_GPU=OFF
```
- **GPU**
```bash
cmake .. -DWITH_GPU=ON
```
- **GPU with doc and swig**
```bash
cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
```
Finally, you can build PaddlePaddle:
```bash
# you can add build option here, such as:
cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<installation path>
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `sysctl -n hw.ncpu` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<installation path>/bin:$PATH
```
**Note:**
If you set `WITH_SWIG_PY=ON`, the related Python dependencies also need to be installed.
Otherwise, PaddlePaddle will install them automatically
the first time you run a paddle command, such as `paddle version` or `paddle train`.
This may require sudo privileges:
```bash
# you can run
sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
# or just run
sudo paddle version
```

@@ -32,6 +32,13 @@ LinearActivation
.. automodule:: paddle.trainer_config_helpers.activations
:members: LinearActivation
:noindex:
LogActivation
==================
.. automodule:: paddle.trainer_config_helpers.activations
:members: LogActivation
:noindex:
SquareActivation
================

@@ -21,8 +21,8 @@ limitations under the License. */
/**
* @brief Matrix transpose: C_d = T(A_d)
*
* @param[in] A_d input matrix (M x N).
* @param[out] C_d output matrix (N x M).
* @param[in] A_d input matrix (dimM x dimN).
* @param[out] C_d output matrix (dimN x dimM).
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
* @param[in] lda the first dimension of A_d.
@@ -39,8 +39,8 @@ extern void hl_matrix_transpose(real *A_d,
/*
* @brief Matrix transpose, while lda = dimN, ldc = dimM.
*
* @param[in] A_d input matrix (M x N).
* @param[out] C_d output matrix (N x M).
* @param[in] A_d input matrix (dimM x dimN).
* @param[out] C_d output matrix (dimN x dimM).
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*
@@ -50,6 +50,22 @@ extern void hl_matrix_transpose(real *A_d,
int dimM,
int dimN);
/*
* @brief Matrix inverse
*
* @param[in] A_d input matrix (dimN x dimN).
* @param[out] C_d output matrix (dimN x dimN).
* @param[in] dimN matrix height = matrix width
* @param[in] lda the first dimension of A_d
* @param[in] ldc the first dimension of C_d
*
*/
extern void hl_matrix_inverse(real *A_d,
real *C_d,
int dimN,
int lda,
int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
*

@@ -30,6 +30,12 @@ inline void hl_matrix_transpose(real *A_d,
int dimM,
int dimN) {}
inline void hl_matrix_inverse(real *A_d,
real *C_d,
int dimN,
int lda,
int ldc) {}
inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
real *B_d, hl_trans_op_t transb,
real *C_d,

@@ -15,6 +15,7 @@ limitations under the License. */
#include <sys/time.h>
#include <mutex>
#include "hl_cuda.h"
#include "hl_cuda_cublas.h"
#include "hl_thread.ph"
#include "hl_dso_loader.h"
@@ -75,6 +76,10 @@ DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched)
CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
#undef DYNAMIC_LOAD_CUBLAS_WRAP
@@ -88,10 +93,14 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
#define CUBLAS_GEAM dynload::cublasSgeam
#define CUBLAS_GEMV dynload::cublasSgemv
#define CUBLAS_GEMM dynload::cublasSgemm
#define CUBLAS_GETRF dynload::cublasSgetrfBatched
#define CUBLAS_GETRI dynload::cublasSgetriBatched
#else
#define CUBLAS_GEAM dynload::cublasDgeam
#define CUBLAS_GEMV dynload::cublasDgemv
#define CUBLAS_GEMM dynload::cublasDgemm
#define CUBLAS_GETRF dynload::cublasDgetrfBatched
#define CUBLAS_GETRI dynload::cublasDgetriBatched
#endif
const char* hl_cublas_get_error_string(cublasStatus_t status) {
@@ -162,6 +171,54 @@ void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {
hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM);
}
void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
/* Solve Ax = I */
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
/* Step 1: Compute the LU decomposition of matrix A */
real **inout_h = &A_d;
real **inout_d = (real **)hl_malloc_device(sizeof(real *));
hl_memcpy(inout_d, inout_h, sizeof(real *));
int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int));
int *info_d = (int *)t_resource.gpu_mem;
/* Note: cublasSgetrfBatched is used to calculate a number of
small-sized matrices. There may be a better way to reconstruct
the API for better performance.
*/
CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
dimN, inout_d, lda, pivot_d,
info_d, 1));
int info_h;
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
}
/* Step 2: Compute the inverse of the matrix given its LU decomposition */
real **out_h = &C_d;
real **out_d = (real **)hl_malloc_device(sizeof(real *));
hl_memcpy(out_d, out_h, sizeof(real *));
CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
dimN, (const real **)inout_d, lda, pivot_d,
out_d, ldc, info_d, 1));
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
}
hl_free_mem_device(inout_d);
hl_free_mem_device(pivot_d);
hl_free_mem_device(out_d);
CHECK_SYNC("hl_matrix_inverse failed");
}
void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
real *B_d, hl_trans_op_t transb,
real *C_d,

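A hedged usage sketch for the new routine, using only `hl_*` calls that appear in this diff; `invertOnGpu`, `A_host`, and `C_host` are hypothetical names, and note that step 1 of the implementation overwrites `A_d` with its LU factors:

```cpp
// Hypothetical call site (A_host/C_host are caller-provided n*n buffers;
// error handling omitted; assumes the hl_* headers are included).
void invertOnGpu(const real *A_host, real *C_host, int n) {
  real *A_d = (real *)hl_malloc_device(n * n * sizeof(real));
  real *C_d = (real *)hl_malloc_device(n * n * sizeof(real));
  hl_memcpy(A_d, (void *)A_host, n * n * sizeof(real));  // upload A
  // lda = ldc = n for a tightly packed matrix.
  hl_matrix_inverse(A_d, C_d, n, /* lda */ n, /* ldc */ n);
  hl_memcpy(C_host, C_d, n * n * sizeof(real));  // download A^-1
  hl_free_mem_device(A_d);
  hl_free_mem_device(C_d);
}
```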
@@ -41,65 +41,28 @@ void* cudnn_dso_handle = nullptr;
#ifdef PADDLE_USE_DSO
#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cudnnStatus_t operator()(Args... args) { \
typedef cudnnStatus_t (*cudnnFunc)(Args...); \
std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \
&cudnn_dso_handle); \
void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnnFunc>(p_##__name)(args...); \
} \
#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using cudnn_func = decltype(__name(args...))(*)(Args...); \
std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \
&cudnn_dso_handle); \
void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
} \
} __name; /* struct DynLoad__##__name */
struct DynLoad__cudnnGetVersion {
template <typename... Args>
size_t operator()(Args... args) {
typedef size_t (*cudnnFunc)(Args...);
std::call_once(cudnn_dso_flag, GetCudnnDsoHandle,
&cudnn_dso_handle);
void* p_name = dlsym(cudnn_dso_handle, "cudnnGetVersion");
return reinterpret_cast<cudnnFunc>(p_name)(args...);
}
} cudnnGetVersion; /* struct DynLoad__##__name */
struct DynLoad__cudnnGetErrorString {
template <typename... Args>
const char* operator()(Args... args) {
typedef const char* (*cudnnFunc)(Args...);
std::call_once(cudnn_dso_flag, GetCudnnDsoHandle,
&cudnn_dso_handle);
void* p_name = dlsym(cudnn_dso_handle, "cudnnGetErrorString");
return reinterpret_cast<cudnnFunc>(p_name)(args...);
}
} cudnnGetErrorString; /* struct DynLoad__##__name */
#else
#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cudnnStatus_t operator()(Args... args) { \
return __name(args...); \
} \
#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
return __name(args...); \
} \
} __name; /* struct DynLoad__##__name */
struct DynLoad__cudnnGetVersion {
template <typename... Args>
size_t operator()(Args... args) {
return cudnnGetVersion(args...);
}
} cudnnGetVersion; /* struct DynLoad__##__name */
struct DynLoad__cudnnGetErrorString {
template <typename... Args>
const char* operator()(Args... args) {
return cudnnGetErrorString(args...);
}
} cudnnGetErrorString; /* struct DynLoad__##__name */
#endif
/**
@@ -133,7 +96,9 @@ struct DynLoad__cudnnGetErrorString {
__macro(cudnnPoolingForward) \
__macro(cudnnPoolingBackward) \
__macro(cudnnSoftmaxBackward) \
__macro(cudnnSoftmaxForward)
__macro(cudnnSoftmaxForward) \
__macro(cudnnGetVersion) \
__macro(cudnnGetErrorString)
CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP)
#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \

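The refactor above collapses the per-return-type wrappers (`cudnnStatus_t`, plus the hand-written `cudnnGetVersion`/`cudnnGetErrorString` specials) into one macro whose `decltype` trailing return type adapts to any signature, which is what lets `cudnnGetVersion` and `cudnnGetErrorString` join the regular routine list. A self-contained sketch of the same technique against libm (illustrative names; link with `-ldl`):

```cpp
#include <dlfcn.h>
#include <math.h>  // declares cos(), whose type the wrapper borrows
#include <mutex>

static std::once_flag dso_flag;
static void* dso_handle = nullptr;
static void OpenLibm() { dso_handle = dlopen("libm.so.6", RTLD_LAZY); }

// One macro covers functions returning double, const char*, size_t, ...
#define DYNAMIC_LOAD_WRAP(__name)                                 \
  struct DynLoad__##__name {                                      \
    template <typename... Args>                                   \
    auto operator()(Args... args) -> decltype(__name(args...)) {  \
      using func_t = decltype(__name(args...)) (*)(Args...);      \
      std::call_once(dso_flag, OpenLibm);                         \
      void* p = dlsym(dso_handle, #__name);                       \
      return reinterpret_cast<func_t>(p)(args...);                \
    }                                                             \
  } __name##_dyn;

DYNAMIC_LOAD_WRAP(cos)  // instance named cos_dyn to avoid shadowing ::cos

int main() { return cos_dyn(0.0) == 1.0 ? 0 : 1; }
```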
@@ -85,44 +85,24 @@ void* cudart_dso_handle = nullptr;
#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cudaError_t operator()(Args... args) { \
typedef cudaError_t (*cudartFunc)(Args...); \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using cudart_func = decltype(__name(args...))(*)(Args...); \
std::call_once(cudart_dso_flag, GetCudartDsoHandle, \
&cudart_dso_handle); \
void* p_##__name = dlsym(cudart_dso_handle, #__name); \
return reinterpret_cast<cudartFunc>(p_##__name)(args...); \
return reinterpret_cast<cudart_func>(p_##__name)(args...); \
} \
} __name; /* struct DynLoad__##__name */
#else
#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cudaError_t operator()(Args... args) { \
auto operator()(Args... args) -> decltype(__name(args...)) { \
return __name(args...); \
} \
} __name; /* struct DynLoad__##__name */
#endif
#ifdef PADDLE_USE_DSO
struct DynLoad__cudaGetErrorString {
template <typename... Args>
const char* operator()(Args... args) {
typedef const char* (*cudaFunc)(Args...);
std::call_once(cudart_dso_flag, GetCudartDsoHandle,
&cudart_dso_handle);
void* p_func = dlsym(cudart_dso_handle, "cudaGetErrorString");
return reinterpret_cast<cudaFunc>(p_func)(args...);
}
} cudaGetErrorString; /* struct DynLoad__cudaGetErrorString */
#else
struct DynLoad__cudaGetErrorString {
template <typename... Args>
const char* operator()(Args... args) {
return cudaGetErrorString(args...);
}
} cudaGetErrorString; /* struct DynLoad__cudaGetErrorString */
#endif
/* include all needed cuda functions in HPPL */
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cudaMalloc) \
@@ -152,7 +132,8 @@ struct DynLoad__cudaGetErrorString {
__macro(cudaSetDeviceFlags) \
__macro(cudaGetLastError) \
__macro(cudaFuncSetCacheConfig) \
__macro(cudaRuntimeGetVersion)
__macro(cudaRuntimeGetVersion) \
__macro(cudaGetErrorString)
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)

@@ -49,14 +49,14 @@ static inline std::string join(const std::string& part1, const std::string& part
static inline void GetDsoHandleFromDefaultPath(
std::string& dso_path, void** dso_handle, int dynload_flags) {
LOG(INFO) << "Try to find cuda library: " << dso_path
<< "from default system path.";
<< " from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 due to
// System Integrity Protection (SIP); if dso_handle
// is null, search the default package path on Mac OS.
#if defined(__APPLE__) or defined(__OSX__)
#if defined(__APPLE__) || defined(__OSX__)
if (nullptr == *dso_handle) {
dso_path = join("/usr/local/cuda/lib/", dso_path);
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);

@@ -295,6 +295,7 @@ void forward(Argument& act) {
void backward(Argument& act) { act.grad->squareDerivative(*act.in); }
END_DEFINE_ACTIVATION(square)
/**
* @brief Exponential Activation.
* \f[
@@ -307,8 +308,36 @@ void forward(Argument& act) { act.value->exp(*act.value); }
void backward(Argument& act) { act.grad->expDerivative(*act.value); }
END_DEFINE_ACTIVATION(exponential)
/**
* @brief Logarithm Activation.
* \f[
* f(z) = log(z)
* \f]
*/
BEGIN_DEFINE_ACTIVATION(log)
void forward(Argument& act) {
SetDevice device(act.deviceId);
Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
/* trans */ false, useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->log(*act.value);
}
void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); }
END_DEFINE_ACTIVATION(log)
ActivationFunction* ActivationFunction::create(const std::string& type) {
return gActivationRegistrar.createByType(type);
}
std::vector<std::string> ActivationFunction::getAllRegisteredTypes() {
std::vector<std::string> types;
gActivationRegistrar.forEachType([&](const std::string& type) {
types.push_back(type);
});
return types;
}
} // namespace paddle
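Stripped of Paddle's Matrix API, the forward/backward pair above is y = log(z) with gradient dz = dy / z, which is exactly the `dotDiv` against the saved input. A standalone numeric sketch:

```cpp
#include <cassert>
#include <cmath>
#include <vector>

// Forward: save the input (act.in->copyFrom), then take log in place.
void logForward(std::vector<double>& v, std::vector<double>& saved) {
  saved = v;
  for (double& x : v) x = std::log(x);
}

// Backward: d/dz log(z) = 1/z, so grad_in = grad_out / z.
void logBackward(std::vector<double>& grad, const std::vector<double>& saved) {
  for (size_t i = 0; i < grad.size(); ++i) grad[i] /= saved[i];
}

int main() {
  std::vector<double> v{2.0}, saved;
  logForward(v, saved);
  std::vector<double> g{1.0};
  logBackward(g, saved);
  assert(std::fabs(g[0] - 0.5) < 1e-12);  // 1/2
  return 0;
}
```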

@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
namespace paddle {
@@ -32,6 +33,7 @@ struct Argument;
class ActivationFunction {
public:
static ActivationFunction* create(const std::string& type);
static std::vector<std::string> getAllRegisteredTypes();
ActivationFunction() {}

@@ -131,9 +131,10 @@ void DoubleBuffer::asyncLoadBatch() {
taskReadySem_.wait();
if (stopping_) break;
while (batchSize_ == 0) {
while (batchSize_ == 0 && !stopping_) {
usleep(5);
}
if (stopping_) break;
do {
DataBatch newBatch;

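The two added checks close a shutdown hang: a loader thread spinning on `batchSize_ == 0` could never observe `stopping_`. A minimal sketch of the pattern with illustrative names:

```cpp
#include <atomic>
#include <chrono>
#include <thread>

std::atomic<bool> stopping{false};
std::atomic<int> batchSize{0};

void waitForBatch() {
  // Re-check the stop flag on every spin so shutdown can interrupt the wait.
  while (batchSize.load() == 0 && !stopping.load()) {
    std::this_thread::sleep_for(std::chrono::microseconds(5));
  }
  if (stopping.load()) return;  // mirrors the added `if (stopping_) break;`
  // ... proceed to consume the batch ...
}
```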
@@ -433,26 +433,34 @@ private:
inline void resetImpl(bool startNewThread) {
DBG << "Resetting " << startNewThread;
exit_.store(true);
if (loadThread_) { // is loading.
exit_.store(true);
loadThread_->join();
loadThread_.reset();
}
{
PyGuard g;
callingContexts_.clear();
this->pullCV_.notify_one();
}
std::lock_guard<std::mutex> guard(mutexForReset_);
{
PyGuard g;
dataPool_.clear();
}
poolActualSize_ = 0;
exit_ = false;
if (startNewThread && cache_->reset()) {
DBG << "Start new thread.";
loadThread_.reset(new std::thread([this] {
exit_ = false;
loadThread();
}));
callingContextCreated_.wait();
}
DBG << "Reset done";
exit_ = false;
}
private:
@@ -465,6 +473,8 @@ private:
std::condition_variable pullCV_;
std::mutex mtx_;
std::mutex mutexForReset_;
ThreadBarrier callingContextCreated_;
std::unique_ptr<IPyDataProviderCache> cache_;
@@ -529,6 +539,7 @@ public:
* Loading a batch of data.
*/
int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
std::lock_guard<std::mutex> guard(mutexForReset_);
REGISTER_TIMER("PyDP2.getNextBatchInternal")
CHECK_GE(size_, 0);
size_t size = (size_t) size_;
@@ -554,6 +565,10 @@ public:
} else { // loading from cache.
poolPtr = this->cache_->load();
}
if (exit_) {
// PyDataProvider is destructing.
return 0;
}
CHECK(poolPtr != nullptr);
std::deque<PyObjectPtr>& pool = *poolPtr;

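The new `mutexForReset_` plus the `exit_` check make `resetImpl` and `getNextBatchInternal` mutually exclusive, so a reset can no longer clear the pool under a reader. A hedged miniature of the protocol (hypothetical `MiniProvider`, not Paddle's class):

```cpp
#include <atomic>
#include <mutex>
#include <vector>

class MiniProvider {
 public:
  int64_t getNextBatch(std::vector<int>* out) {
    std::lock_guard<std::mutex> guard(mutexForReset_);
    if (exit_.load()) return 0;  // provider is resetting; hand back nothing
    if (pool_.empty()) return 0;
    out->swap(pool_);
    return static_cast<int64_t>(out->size());
  }

  void reset() {
    exit_.store(true);  // flag consumers before taking the lock
    std::lock_guard<std::mutex> guard(mutexForReset_);
    pool_.clear();      // safe: no consumer holds the lock now
    exit_.store(false);
  }

 private:
  std::mutex mutexForReset_;
  std::atomic<bool> exit_{false};
  std::vector<int> pool_;
};
```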
@@ -28,6 +28,12 @@ void ParallelNeuralNetwork::init(
const std::vector<ParameterType>& parameterTypes, bool useGpu) {
NeuralNetwork::init(config, callback, parameterTypes, useGpu);
if (config.type() == "recurrent_nn") {
LOG(FATAL)
<< "You can not add `--parallel_nn=true` on the command line, "
<< "parallel_nn training mode does not support the recurrent_nn model.";
}
useGpu_ = useGpu;
numDevices_ = 0;
if (useGpu_) {

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/Logging.h"
#include "ConvBaseLayer.h"
namespace paddle {
@@ -78,10 +77,10 @@ size_t ConvBaseLayer::calOutputSize() {
imgSizeH_[i] = config_.inputs(i).conv_conf().img_size();
if (imgSizeW_[i] == 0)
imgSizeW_[i] = config_.inputs(i).conv_conf().img_size();
outputH_.push_back(
outputSize(imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i]));
outputW_.push_back(
outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i]));
outputH_.push_back(outputSize(imgSizeH_[i], filterSizeY_[i], paddingY_[i],
strideY_[i], caffeMode_));
outputW_.push_back(outputSize(imgSizeW_[i], filterSize_[i], padding_[i],
stride_[i], caffeMode_));
CHECK_EQ(outputH_[i], outputH_[0]);
CHECK_EQ(outputW_[i], outputW_[0]);
}

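The call sites now thread `caffeMode_` through to `outputSize`. The helper itself is not shown in this diff; assuming the usual Caffe/PaddlePaddle conventions, the two modes differ only in floor versus ceil division:

```cpp
#include <cassert>

int outputSize(int imageSize, int filterSize, int padding, int stride,
               bool caffeMode) {
  if (caffeMode)  // Caffe-style: drop a partial last window (floor)
    return (imageSize - filterSize + 2 * padding) / stride + 1;
  // Otherwise keep the partial window (ceil)
  return (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1;
}

int main() {
  assert(outputSize(10, 3, 1, 2, true) == 5);   // floor(9/2) + 1
  assert(outputSize(10, 3, 1, 2, false) == 6);  // ceil(9/2) + 1
  return 0;
}
```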
Some files were not shown because too many files have changed in this diff.
