Merge pull request #138 from gangliao/master

Add Mac OS X port
gangliao authored 8 years ago; committed by GitHub
commit 2920b6bc0d

.gitignore — 4 additions

@@ -1,2 +1,6 @@
 *.DS_Store
 build/
+*.user
+.vscode
+.idea

@@ -65,12 +65,14 @@ set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
 set(OPENBLAS_INCLUDE_SEARCH_PATHS
     ${OPENBLAS_ROOT}/include
     /usr/include
-    /usr/include/openblas)
+    /usr/include/openblas
+    /usr/local/opt/openblas/include)
 set(OPENBLAS_LIB_SEARCH_PATHS
     ${OPENBLAS_ROOT}/lib
     /usr/lib
     /usr/lib/blas/openblas
-    /usr/lib/openblas)
+    /usr/lib/openblas
+    /usr/local/opt/openblas/lib)
 find_path(OPENBLAS_INC_DIR NAMES cblas.h
     PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})

@@ -15,7 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
     /usr/lib)
-find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a
+find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a
     PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
           NO_DEFAULT_PATH
     DOC "Path to cuDNN library.")

@@ -1,16 +1,55 @@
 # Some common routine for paddle compile.
 # target_circle_link_libraries
 #     Link libraries to target which has circle dependencies.
 #
 #     First Argument: target name want to be linked with libraries
 #     Rest Arguments: libraries which link together.
 function(target_circle_link_libraries TARGET_NAME)
-    target_link_libraries(${TARGET_NAME}
-        -Wl,--start-group
-        ${ARGN}
-        -Wl,--end-group)
+    if(APPLE)
+        set(LIBS)
+        set(inArchive OFF)
+        set(libsInArgn)
+        foreach(arg ${ARGN})
+            if(${arg} STREQUAL "ARCHIVE_START")
+                set(inArchive ON)
+            elseif(${arg} STREQUAL "ARCHIVE_END")
+                set(inArchive OFF)
+            else()
+                if(inArchive)
+                    list(APPEND LIBS "-Wl,-force_load")
+                endif()
+                list(APPEND LIBS ${arg})
+                list(APPEND libsInArgn ${arg})
+            endif()
+        endforeach()
+        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+            list(APPEND LIBS "-undefined dynamic_lookup")
+        endif()
+        list(REVERSE libsInArgn)
+        target_link_libraries(${TARGET_NAME}
+            ${LIBS}
+            ${libsInArgn})
+    else()  # LINUX
+        set(LIBS)
+        foreach(arg ${ARGN})
+            if(${arg} STREQUAL "ARCHIVE_START")
+                list(APPEND LIBS "-Wl,--whole-archive")
+            elseif(${arg} STREQUAL "ARCHIVE_END")
+                list(APPEND LIBS "-Wl,--no-whole-archive")
+            else()
+                list(APPEND LIBS ${arg})
+            endif()
+        endforeach()
+        target_link_libraries(${TARGET_NAME}
+            "-Wl,--start-group"
+            ${LIBS}
+            "-Wl,--end-group")
+    endif()
 endfunction()
 # compile_cu_as_cpp
@@ -41,20 +80,20 @@ function(link_paddle_exe TARGET_NAME)
     if(PADDLE_WITH_INTERNAL)
         set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter)
         target_circle_link_libraries(${TARGET_NAME}
-            -Wl,--whole-archive
+            ARCHIVE_START
             paddle_internal_gserver
             paddle_internal_owlqn
-            -Wl,--no-whole-archive
+            ARCHIVE_END
            paddle_internal_parameter)
     else()
         set(INTERAL_LIBS "")
     endif()
     target_circle_link_libraries(${TARGET_NAME}
-        -Wl,--whole-archive
+        ARCHIVE_START
         paddle_gserver
         ${METRIC_LIBS}
-        -Wl,--no-whole-archive
+        ARCHIVE_END
         paddle_pserver
         paddle_trainer_lib
         paddle_network

(File diff suppressed because it is too large.)

@@ -20,6 +20,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/utils/GlobalConstants.h"
+#include "paddle/utils/TypeDefs.h"
 /// Import PaddlePaddle's enumeration into global namespace.
 using namespace paddle::enumeration_wrapper;  // NOLINT

@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Flags.h"
+#include "paddle/utils/Excepts.h"
 #include "paddle/parameter/Parameter.h"
 #include <fenv.h>

@@ -15,6 +15,19 @@
 try:
     from paddle_api_config import *
     import os.path
+    import platform
+
+    system = platform.system().lower()
+    is_osx = (system == 'darwin')
+    is_win = (system == 'windows')
+    is_lin = (system == 'linux')
+
+    if is_lin:
+        whole_start = "-Wl,--whole-archive"
+        whole_end = "-Wl,--no-whole-archive"
+    elif is_osx:
+        whole_start = ""
+        whole_end = ""
     LIB_DIRS = ["math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver", "trainer"]
     PARENT_LIB_DIRS = ['proto']
@@ -56,9 +69,9 @@ try:
         def libs_str(self):
             libs = [
-                "-Wl,--whole-archive",
+                whole_start,
                 "-lpaddle_gserver",
-                "-Wl,--no-whole-archive",
+                whole_end,
                 "-lpaddle_pserver",
                 "-lpaddle_trainer_lib",
                 "-lpaddle_network",

@@ -16,28 +16,37 @@ limitations under the License. */
 #ifndef HL_DEVICE_FUNCTIONS_CUH_
 #define HL_DEVICE_FUNCTIONS_CUH_
-namespace hppl {
-
-static __inline__ __device__ double atomicAdd(double* address, double val) {
-  // NOLINTNEXTLINE
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull,
-                    assumed,
-                    __double_as_longlong(val +
-                    __longlong_as_double(assumed)));
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
-
-}  // namespace hppl
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-using hppl::atomicAdd;
-#endif
+namespace paddle {
+
+template <class T>
+inline __device__ T paddleAtomicAdd(T* address, T val);
+
+template <>
+inline __device__ float paddleAtomicAdd(float* address, float val) {
+  return atomicAdd(address, val);
+}
+
+template <>
+inline __device__ double paddleAtomicAdd(double* address, double val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+  return atomicAdd(address, val);
+#else
+  // NOLINTNEXTLINE
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull,
+                    assumed,
+                    __double_as_longlong(val +
+                    __longlong_as_double(assumed)));
+  } while (assumed != old);
+  return __longlong_as_double(old);
+#endif
+}
+
+}  // namespace paddle
 #endif /* HL_DEVICE_FUNCTIONS_CUH_ */
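Note: the __CUDA_ARCH__ >= 600 guard exists because hardware double-precision atomicAdd is only available from sm_60 (Pascal) onward; older GPUs must emulate it with a compare-and-swap loop, exactly as the #else branch does. A minimal host-side sketch of the same pattern, using std::atomic (illustrative only, not Paddle code):

#include <atomic>
#include <cstdint>
#include <cstring>

// Reinterpret the double's bits as a 64-bit integer, then retry the
// compare-exchange until no other thread intervened between read and swap.
inline double atomicAddDouble(std::atomic<uint64_t>& bits, double val) {
  uint64_t assumed = bits.load();
  uint64_t desired;
  do {
    double oldVal;
    std::memcpy(&oldVal, &assumed, sizeof(double));   // __longlong_as_double
    const double newVal = oldVal + val;
    std::memcpy(&desired, &newVal, sizeof(double));   // __double_as_longlong
    // On failure, compare_exchange_weak reloads `assumed`,
    // mirroring the kernel's `assumed = old` step.
  } while (!bits.compare_exchange_weak(assumed, desired));
  double previous;
  std::memcpy(&previous, &assumed, sizeof(double));
  return previous;  // old value, matching atomicAdd's return convention
}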

@@ -192,10 +192,10 @@ __global__ void KeLstmBackward(Op op,
   if (isBatch) {
     if (value.prevStateValue) {
-      if (grad.checkIgGrad) atomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
-      if (grad.checkFgGrad) atomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
+      if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
+      if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
     }
-    if (grad.checkOgGrad) atomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
+    if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
   } else {
     if (value.prevStateValue) {
       if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;

@@ -27,6 +27,8 @@ typedef float4 vecType;
 typedef double2 vecType;
 #endif
 #else
+#include <mmintrin.h>
+#include <xmmintrin.h>
 #include <emmintrin.h>
 #ifndef HPPL_TYPE_DOUBLE
 typedef __m128 vecType;

@@ -25,6 +25,9 @@ limitations under the License. */
 #define VECTOR_LEN 4
 #define VECTOR_SET _mm_set_ps1
 #else
+#if defined(__APPLE__) || defined(__OSX__)
+#define _mm_set_pd1 _mm_set1_pd
+#endif
 /* number of double in vector */
 #define VECTOR_LEN 2
 #define VECTOR_SET _mm_set_pd1
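Note: _mm_set_pd1 is an older alias that Apple's headers omit, while _mm_set1_pd is the canonical SSE2 intrinsic; the #define above simply maps one onto the other. A standalone check (assumes an x86 target with SSE2):

#include <emmintrin.h>
#include <cstdio>

int main() {
  // Broadcast a single double into both lanes of a 128-bit vector.
  __m128d v = _mm_set1_pd(3.0);
  double out[2];
  _mm_storeu_pd(out, v);
  std::printf("%f %f\n", out[0], out[1]);  // 3.000000 3.000000
  return 0;
}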

@@ -209,7 +209,18 @@ __thread cudaStream_t default_stream = 0;
 __thread bool g_sync_flag = true;
 bool hl_start_flag = false;
-#define gettid() syscall(SYS_gettid)
+inline pid_t gettid() {
+#if defined(__APPLE__) || defined(__OSX__)
+  pid_t tid = syscall(SYS_thread_selfid);
+#else
+#ifndef __NR_gettid
+#define __NR_gettid 224
+#endif
+  pid_t tid = syscall(__NR_gettid);
+#endif
+  CHECK_NE(tid, -1);
+  return tid;
+}
 void hl_init(int device) {
     CHECK(hl_start_flag)
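Note: there is no portable gettid(); Linux exposes the kernel thread id only as a raw syscall, and macOS spells it SYS_thread_selfid. The hardcoded fallback deserves a caveat: 224 is __NR_gettid on 32-bit x86 only (x86-64 uses 186), so the #ifndef branch is a last-resort guess rather than a portable default. Where only a printable per-thread identifier is needed (e.g. for logging), standard C++ offers a portable alternative, sketched below; the raw tid remains the right choice when the value must match what the kernel and tools like top report.

#include <iostream>
#include <thread>

int main() {
  // Portable but opaque: std::thread::id can be printed and compared,
  // but cannot be correlated with kernel-level tools.
  std::cout << "thread id: " << std::this_thread::get_id() << "\n";
  return 0;
}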

@@ -564,11 +564,11 @@ __global__ void KeLstmBackward(real *gateValue,
     /* TODO: Temporary save & merger in another kernel */
     if (frameIdy == 1) {
-      if (checkIgGrad) atomicAdd(checkIgGrad+frameIdx, rCheckGrad);
+      if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad);
     } else if (frameIdy == 2) {
-      if (checkFgGrad) atomicAdd(checkFgGrad+frameIdx, rCheckGrad);
+      if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad);
     } else if (frameIdy == 3) {
-      if (checkOgGrad) atomicAdd(checkOgGrad+frameIdx, rCheckGrad);
+      if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad);
     }
   }

@@ -623,7 +623,7 @@ __global__ void KeCosSimDerivative(real* grad,
       prevGradY[index] +=
           scale * grad[ty] * prevOutX[index] * reciprocal;
     } else {
-      atomicAdd(prevGradY + index,
+      paddle::paddleAtomicAdd(prevGradY + index,
           scale * grad[ty] * prevOutX[index] * reciprocal);
     }
   }
@@ -640,7 +640,7 @@ __global__ void KeCosSimDerivative(real* grad,
           (prevOutX[index] * reciprocalXY -
            prevOutY[index] * reciprocalSquareSumY);
     } else {
-      atomicAdd(prevGradY + index, output[ty] * grad[ty] *
+      paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
           (prevOutX[index] * reciprocalXY -
            prevOutY[index] * reciprocalSquareSumY));
     }

@@ -362,7 +362,7 @@ __global__ void KeMatrixAddRows(real* output,
       if (AddRow == 0) {
         outputData[i] += tableData[i];
       } else {
-        atomicAdd(&tableData[i], outputData[i]);
+        paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
       }
     }
   }

@@ -280,7 +280,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
     if (index_n_t < dimN) {
       real tmp;
       tmp = alpha*a_r*b_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
       index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
     }
@@ -328,7 +328,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
     if (index_n_t < dimN) {
       real tmp;
       tmp = alpha*a_r*b_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
       index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
     }
@@ -629,7 +629,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
   for (int n=0; n < CU_DM_CSR_N; n++) {
     if (index_m_t++ < dimM) {
       tmp = alpha * b_r * a_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += dimN;
     }
   }
@@ -660,7 +660,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
   for (int n=0; n < CU_DM_CSR_N; n++) {
     if (index_m_t++ < dimM) {
       tmp = alpha * b_r * a_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += dimN;
     }
   }
@@ -912,7 +912,7 @@ __global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val,
   for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) {
     int colIdx = csr_col[idx];
     real val = csr_val[idx];
-    atomicAdd(a_val + colIdx, val);
+    paddle::paddleAtomicAdd(a_val + colIdx, val);
   }
 }

@@ -69,23 +69,40 @@ static inline void GetDsoHandleWithSearchPath(
   CHECK(nullptr != *dso_handle)
       << "For Gpu version of PaddlePaddle, it couldn't find CUDA library: "
-      << dlPath.c_str() << " Please make sure you already specify its path."
-      << "Note: for training data on Cpu using Gpu version of PaddlePaddle,"
-      << "you must specify libcudart.so via LD_LIBRARY_PATH.";
+      << dlPath.c_str() << ". Please make sure you already specify its path. "
+      << "Note: for training data on Cpu using Gpu version of PaddlePaddle, "
+      << "you must specify libcudart via export LD_LIBRARY_PATH for Linux or "
+      << "export DYLD_LIBRARY_PATH for MAC OS.";
 }
 void GetCublasDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+#else
   GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+#endif
 }
 void GetCudnnDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+#else
   GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+#endif
 }
 void GetCudartDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath("", "libcudart.dylib", dso_handle);
+#else
   GetDsoHandleWithSearchPath("", "libcudart.so", dso_handle);
+#endif
 }
 void GetCurandDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+#else
   GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+#endif
 }
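Note: each loader applies the same compile-time suffix selection (.so on Linux, .dylib on macOS). Reduced to a standalone sketch (zlib is just an illustrative library that ships on both platforms):

#include <dlfcn.h>
#include <cstdio>

#if defined(__APPLE__) || defined(__OSX__)
static const char* kLibName = "libz.dylib";  // macOS shared-library suffix
#else
static const char* kLibName = "libz.so.1";   // Linux soname
#endif

int main() {
  void* handle = dlopen(kLibName, RTLD_LAZY);
  if (handle == nullptr) {
    // dlerror() explains the failure, e.g. a directory missing from
    // LD_LIBRARY_PATH (Linux) or DYLD_LIBRARY_PATH (macOS).
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  dlclose(handle);
  return 0;
}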

@@ -35,7 +35,7 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
   real *tab = table + tableId * ldt;
   for (int i = idx; i < dim; i += blockDimX) {
     if (AddRow) {
-      atomicAdd(&tab[i], out[i]);
+      paddle::paddleAtomicAdd(&tab[i], out[i]);
     } else {
       out[i] += tab[i];
     }

@@ -65,7 +65,8 @@ void DataProviderGroup<T>::reset() {
   provider_ = nullptr;
   // shuffle file list
-  std::random_shuffle(fileList_.begin(), fileList_.end());
+  std::shuffle(fileList_.begin(), fileList_.end(),
+               ThreadLocalRandomEngine::get());
   startLoader();
   DataProvider::reset();

@@ -374,7 +374,8 @@ void ProtoDataProvider::reset() {
 }
 void ProtoDataProvider::shuffle() {
-  std::random_shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end());
+  std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(),
+               ThreadLocalRandomEngine::get());
 }
 /*
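Note: std::random_shuffle draws from an unspecified source (commonly rand()); it was later deprecated in C++14 and removed in C++17. std::shuffle makes the engine explicit, which is also what allows reproducible shuffles under a fixed seed. A minimal standalone equivalent (ThreadLocalRandomEngine::get() is Paddle's per-thread engine; a plain std::mt19937 stands in here):

#include <algorithm>
#include <random>
#include <vector>

int main() {
  std::vector<int> sequenceIds = {0, 1, 2, 3, 4, 5};
  // Any uniform random bit generator works; a fixed seed instead of
  // random_device would make the shuffle order reproducible.
  std::mt19937 engine(std::random_device{}());
  std::shuffle(sequenceIds.begin(), sequenceIds.end(), engine);
  return 0;
}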

@@ -17,6 +17,8 @@ limitations under the License. */
 #include "paddle/utils/PythonUtil.h"
 #include <fenv.h>
 #include "paddle/utils/Util.h"
+#include "paddle/utils/Excepts.h"
 namespace paddle {
@@ -44,7 +46,6 @@ PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu,
 }
 void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
-  int feFlag = fegetexcept();
   VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_;
   classInstance_ =
       createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
@@ -55,7 +56,7 @@ void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
   std::string headerInfo =
       std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
   parseHeaderData(headerInfo);
-  feenableexcept(feFlag);
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
 }
 void PyDataProvider::parseHeaderData(const std::string& headerData) {
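Note: fegetexcept() and feenableexcept() (trapping on floating-point exceptions) are glibc extensions that macOS lacks, which is presumably what the new paddle/utils/Excepts.h include shims; hence the code now enables a fixed mask instead of restoring the previously saved flags. Standard C++ can only poll the exception flags portably, as in this small sketch:

#include <cfenv>
#include <cstdio>

int main() {
  // Testing and clearing FP exception flags is portable <cfenv>;
  // enabling traps (feenableexcept) is not.
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile double zero = 0.0;
  volatile double r = 1.0 / zero;  // raises FE_DIVBYZERO, no trap
  (void)r;
  if (std::fetestexcept(FE_DIVBYZERO)) {
    std::puts("FE_DIVBYZERO was raised");
  }
  return 0;
}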

@@ -385,17 +385,17 @@ void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
   }
 }
-extern NeuralNetwork* newCustomNeuralNetwork(
-    const std::string& name, NeuralNetwork* network) __attribute__((weak));
+extern NeuralNetwork* newCustomNerualNetwork(
+    const std::string& name, NeuralNetwork* network) __attribute__((weak));
 NeuralNetwork* NeuralNetwork::newNeuralNetwork(
     const std::string& name,
     NeuralNetwork* rootNetwork) {
-  if (newCustomNeuralNetwork) {
-    return newCustomNeuralNetwork(name, rootNetwork);
-  } else {
-    return new NeuralNetwork(name, rootNetwork);
-  }
+  if (newCustomNerualNetwork) {
+    return newCustomNerualNetwork(name, rootNetwork);
+  } else {
+    return new NeuralNetwork(name, rootNetwork);
+  }
 }
 }  // namespace paddle
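Note: the __attribute__((weak)) declaration is what makes the custom-network hook optional: if no linked object defines the symbol, its address is null and the default branch runs. A standalone sketch of the pattern with hypothetical names (behavior as described for GCC/Clang on ELF; on Mach-O an unresolved weak reference additionally needs a flag such as the -undefined dynamic_lookup that the CMake change above passes under Clang):

#include <cstdio>

// Weak declaration: the program links even if makeCustomValue is never
// defined, in which case the function's address compares equal to nullptr.
extern "C" int makeCustomValue(int base) __attribute__((weak));

int makeValue(int base) {
  if (makeCustomValue != nullptr) {
    return makeCustomValue(base);  // an override was linked in
  }
  return base * 2;  // default implementation
}

int main() {
  std::printf("%d\n", makeValue(21));  // prints 42 unless overridden
  return 0;
}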

@@ -16,9 +16,9 @@
 from paddle.trainer_config_helpers import *
-settings(batch_size=1000)
+settings(batch_size=300)
-data = data_layer(name ="input", size=100000)
+data = data_layer(name ="input", size=10000)
 # emb1 is equal to emb2, note that bias_attr=false
 # and act=LinearActivation() in default.

@@ -16,9 +16,9 @@
 from paddle.trainer_config_helpers import *
-settings(batch_size=1000)
+settings(batch_size=300)
-data = data_layer(name ="input", size=100000)
+data = data_layer(name ="input", size=10000)
 proj1 = table_projection(input=data, size=128)

(Some files were not shown because too many files have changed in this diff.)
