Merge pull request #138 from gangliao/master

Add Mac OS X port
gangliao authored 8 years ago; committed by GitHub
commit 2920b6bc0d

.gitignore — 4 additions

@@ -1,2 +1,6 @@
 *.DS_Store
 build/
+*.user
+.vscode
+.idea

@@ -65,12 +65,14 @@ set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
 set(OPENBLAS_INCLUDE_SEARCH_PATHS
     ${OPENBLAS_ROOT}/include
     /usr/include
-    /usr/include/openblas)
+    /usr/include/openblas
+    /usr/local/opt/openblas/include)
 set(OPENBLAS_LIB_SEARCH_PATHS
     ${OPENBLAS_ROOT}/lib
     /usr/lib
     /usr/lib/blas/openblas
-    /usr/lib/openblas)
+    /usr/lib/openblas
+    /usr/local/opt/openblas/lib)
 find_path(OPENBLAS_INC_DIR NAMES cblas.h
     PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})

@@ -15,7 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
     /usr/lib)
-find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a
+find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a
     PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
           NO_DEFAULT_PATH
     DOC "Path to cuDNN library.")

@@ -1,16 +1,55 @@
 # Some common routine for paddle compile.
 # target_circle_link_libraries
 #     Link libraries to target which has circle dependencies.
 #
 #     First Argument: target name want to be linked with libraries
 #     Rest Arguments: libraries which link together.
 function(target_circle_link_libraries TARGET_NAME)
-    target_link_libraries(${TARGET_NAME}
-        -Wl,--start-group
-        ${ARGN}
-        -Wl,--end-group)
+    if(APPLE)
+        set(LIBS)
+        set(inArchive OFF)
+        set(libsInArgn)
+        foreach(arg ${ARGN})
+            if(${arg} STREQUAL "ARCHIVE_START")
+                set(inArchive ON)
+            elseif(${arg} STREQUAL "ARCHIVE_END")
+                set(inArchive OFF)
+            else()
+                if(inArchive)
+                    list(APPEND LIBS "-Wl,-force_load")
+                endif()
+                list(APPEND LIBS ${arg})
+                list(APPEND libsInArgn ${arg})
+            endif()
+        endforeach()
+        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+            list(APPEND LIBS "-undefined dynamic_lookup")
+        endif()
+        list(REVERSE libsInArgn)
+        target_link_libraries(${TARGET_NAME}
+            ${LIBS}
+            ${libsInArgn})
+    else()  # LINUX
+        set(LIBS)
+        foreach(arg ${ARGN})
+            if(${arg} STREQUAL "ARCHIVE_START")
+                list(APPEND LIBS "-Wl,--whole-archive")
+            elseif(${arg} STREQUAL "ARCHIVE_END")
+                list(APPEND LIBS "-Wl,--no-whole-archive")
+            else()
+                list(APPEND LIBS ${arg})
+            endif()
+        endforeach()
+        target_link_libraries(${TARGET_NAME}
+            "-Wl,--start-group"
+            ${LIBS}
+            "-Wl,--end-group")
+    endif()
 endfunction()
 # compile_cu_as_cpp
@@ -41,20 +80,20 @@ function(link_paddle_exe TARGET_NAME)
     if(PADDLE_WITH_INTERNAL)
         set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter)
         target_circle_link_libraries(${TARGET_NAME}
-            -Wl,--whole-archive
+            ARCHIVE_START
             paddle_internal_gserver
             paddle_internal_owlqn
-            -Wl,--no-whole-archive
+            ARCHIVE_END
            paddle_internal_parameter)
     else()
         set(INTERAL_LIBS "")
     endif()
     target_circle_link_libraries(${TARGET_NAME}
-        -Wl,--whole-archive
+        ARCHIVE_START
         paddle_gserver
         ${METRIC_LIBS}
-        -Wl,--no-whole-archive
+        ARCHIVE_END
         paddle_pserver
         paddle_trainer_lib
         paddle_network

(File diff suppressed because it is too large.)

@@ -20,6 +20,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/utils/GlobalConstants.h"
+#include "paddle/utils/TypeDefs.h"
 /// Import PaddlePaddle's enumeration into global namespace.
 using namespace paddle::enumeration_wrapper;  // NOLINT

@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Flags.h"
+#include "paddle/utils/Excepts.h"
 #include "paddle/parameter/Parameter.h"
 #include <fenv.h>

@@ -15,6 +15,19 @@
 try:
     from paddle_api_config import *
     import os.path
+    import platform
+
+    system = platform.system().lower()
+    is_osx = (system == 'darwin')
+    is_win = (system == 'windows')
+    is_lin = (system == 'linux')
+
+    if is_lin:
+        whole_start = "-Wl,--whole-archive"
+        whole_end = "-Wl,--no-whole-archive"
+    elif is_osx:
+        whole_start = ""
+        whole_end = ""
     LIB_DIRS = ["math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver", "trainer"]
     PARENT_LIB_DIRS = ['proto']
@@ -56,9 +69,9 @@ try:
         def libs_str(self):
             libs = [
-                "-Wl,--whole-archive",
+                whole_start,
                 "-lpaddle_gserver",
-                "-Wl,--no-whole-archive",
+                whole_end,
                 "-lpaddle_pserver",
                 "-lpaddle_trainer_lib",
                 "-lpaddle_network",

@@ -16,28 +16,37 @@ limitations under the License. */
 #ifndef HL_DEVICE_FUNCTIONS_CUH_
 #define HL_DEVICE_FUNCTIONS_CUH_
-namespace hppl {
-
-static __inline__ __device__ double atomicAdd(double* address, double val) {
-  // NOLINTNEXTLINE
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull,
-                    assumed,
-                    __double_as_longlong(val +
-                    __longlong_as_double(assumed)));
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
-
-}  // namespace hppl
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-using hppl::atomicAdd;
-#endif
+namespace paddle {
+
+template <class T>
+inline __device__ T paddleAtomicAdd(T* address, T val);
+
+template <>
+inline __device__ float paddleAtomicAdd(float* address, float val) {
+  return atomicAdd(address, val);
+}
+
+template <>
+inline __device__ double paddleAtomicAdd(double* address, double val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+  return atomicAdd(address, val);
+#else
+  // NOLINTNEXTLINE
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull,
+                    assumed,
+                    __double_as_longlong(val +
+                    __longlong_as_double(assumed)));
+  } while (assumed != old);
+  return __longlong_as_double(old);
+#endif
+}
+
+}  // namespace paddle
 #endif /* HL_DEVICE_FUNCTIONS_CUH_ */
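Note: the __CUDA_ARCH__ >= 600 guard exists because hardware double-precision atomicAdd is only available from sm_60 (Pascal) onward; older GPUs must emulate it with a compare-and-swap loop, exactly as the #else branch does. A minimal host-side sketch of the same pattern, using std::atomic (illustrative only, not Paddle code):

#include <atomic>
#include <cstdint>
#include <cstring>

// Reinterpret the double's bits as a 64-bit integer, then retry the
// compare-exchange until no other thread intervened between read and swap.
inline double atomicAddDouble(std::atomic<uint64_t>& bits, double val) {
  uint64_t assumed = bits.load();
  uint64_t desired;
  do {
    double oldVal;
    std::memcpy(&oldVal, &assumed, sizeof(double));   // __longlong_as_double
    const double newVal = oldVal + val;
    std::memcpy(&desired, &newVal, sizeof(double));   // __double_as_longlong
    // On failure, compare_exchange_weak reloads `assumed`,
    // mirroring the kernel's `assumed = old` step.
  } while (!bits.compare_exchange_weak(assumed, desired));
  double previous;
  std::memcpy(&previous, &assumed, sizeof(double));
  return previous;  // old value, matching atomicAdd's return convention
}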

@@ -192,10 +192,10 @@ __global__ void KeLstmBackward(Op op,
   if (isBatch) {
     if (value.prevStateValue) {
-      if (grad.checkIgGrad) atomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
-      if (grad.checkFgGrad) atomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
+      if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
+      if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
     }
-    if (grad.checkOgGrad) atomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
+    if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
   } else {
     if (value.prevStateValue) {
       if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;

@@ -27,6 +27,8 @@ typedef float4 vecType;
 typedef double2 vecType;
 #endif
 #else
+#include <mmintrin.h>
+#include <xmmintrin.h>
 #include <emmintrin.h>
 #ifndef HPPL_TYPE_DOUBLE
 typedef __m128 vecType;

@@ -25,6 +25,9 @@ limitations under the License. */
 #define VECTOR_LEN 4
 #define VECTOR_SET _mm_set_ps1
 #else
+#if defined(__APPLE__) || defined(__OSX__)
+#define _mm_set_pd1 _mm_set1_pd
+#endif
 /* number of double in vector */
 #define VECTOR_LEN 2
 #define VECTOR_SET _mm_set_pd1
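Note: _mm_set_pd1 is an older alias that Apple's headers omit, while _mm_set1_pd is the canonical SSE2 intrinsic; the #define above simply maps one onto the other. A standalone check (assumes an x86 target with SSE2):

#include <emmintrin.h>
#include <cstdio>

int main() {
  // Broadcast a single double into both lanes of a 128-bit vector.
  __m128d v = _mm_set1_pd(3.0);
  double out[2];
  _mm_storeu_pd(out, v);
  std::printf("%f %f\n", out[0], out[1]);  // 3.000000 3.000000
  return 0;
}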

@@ -209,7 +209,18 @@ __thread cudaStream_t default_stream = 0;
 __thread bool g_sync_flag = true;
 bool hl_start_flag = false;
-#define gettid() syscall(SYS_gettid)
+inline pid_t gettid() {
+#if defined(__APPLE__) || defined(__OSX__)
+  pid_t tid = syscall(SYS_thread_selfid);
+#else
+#ifndef __NR_gettid
+#define __NR_gettid 224
+#endif
+  pid_t tid = syscall(__NR_gettid);
+#endif
+  CHECK_NE(tid, -1);
+  return tid;
+}
 void hl_init(int device) {
     CHECK(hl_start_flag)
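Note: there is no portable gettid(); Linux exposes the kernel thread id only as a raw syscall, and macOS spells it SYS_thread_selfid. The hardcoded fallback deserves a caveat: 224 is __NR_gettid on 32-bit x86 only (x86-64 uses 186), so the #ifndef branch is a last-resort guess rather than a portable default. Where only a printable per-thread identifier is needed (e.g. for logging), standard C++ offers a portable alternative, sketched below; the raw tid remains the right choice when the value must match what the kernel and tools like top report.

#include <iostream>
#include <thread>

int main() {
  // Portable but opaque: std::thread::id can be printed and compared,
  // but cannot be correlated with kernel-level tools.
  std::cout << "thread id: " << std::this_thread::get_id() << "\n";
  return 0;
}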

@@ -564,11 +564,11 @@ __global__ void KeLstmBackward(real *gateValue,
     /* TODO: Temporary save & merger in another kernel */
     if (frameIdy == 1) {
-      if (checkIgGrad) atomicAdd(checkIgGrad+frameIdx, rCheckGrad);
+      if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad);
     } else if (frameIdy == 2) {
-      if (checkFgGrad) atomicAdd(checkFgGrad+frameIdx, rCheckGrad);
+      if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad);
     } else if (frameIdy == 3) {
-      if (checkOgGrad) atomicAdd(checkOgGrad+frameIdx, rCheckGrad);
+      if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad);
     }
   }

@@ -623,7 +623,7 @@ __global__ void KeCosSimDerivative(real* grad,
       prevGradY[index] +=
           scale * grad[ty] * prevOutX[index] * reciprocal;
     } else {
-      atomicAdd(prevGradY + index,
+      paddle::paddleAtomicAdd(prevGradY + index,
           scale * grad[ty] * prevOutX[index] * reciprocal);
     }
   }
@@ -640,7 +640,7 @@ __global__ void KeCosSimDerivative(real* grad,
           (prevOutX[index] * reciprocalXY -
            prevOutY[index] * reciprocalSquareSumY);
     } else {
-      atomicAdd(prevGradY + index, output[ty] * grad[ty] *
+      paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
           (prevOutX[index] * reciprocalXY -
            prevOutY[index] * reciprocalSquareSumY));
     }

@@ -362,7 +362,7 @@ __global__ void KeMatrixAddRows(real* output,
       if (AddRow == 0) {
         outputData[i] += tableData[i];
       } else {
-        atomicAdd(&tableData[i], outputData[i]);
+        paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
       }
     }
   }

@@ -280,7 +280,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
     if (index_n_t < dimN) {
       real tmp;
       tmp = alpha*a_r*b_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
       index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
     }
@@ -328,7 +328,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
     if (index_n_t < dimN) {
       real tmp;
       tmp = alpha*a_r*b_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
       index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
     }
@@ -629,7 +629,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
   for (int n=0; n < CU_DM_CSR_N; n++) {
     if (index_m_t++ < dimM) {
       tmp = alpha * b_r * a_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += dimN;
     }
   }
@@ -660,7 +660,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
   for (int n=0; n < CU_DM_CSR_N; n++) {
     if (index_m_t++ < dimM) {
       tmp = alpha * b_r * a_r[n];
-      atomicAdd(C_d_r, tmp);
+      paddle::paddleAtomicAdd(C_d_r, tmp);
       C_d_r += dimN;
     }
   }
@@ -912,7 +912,7 @@ __global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val,
   for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) {
     int colIdx = csr_col[idx];
     real val = csr_val[idx];
-    atomicAdd(a_val + colIdx, val);
+    paddle::paddleAtomicAdd(a_val + colIdx, val);
   }
 }

@@ -69,23 +69,40 @@ static inline void GetDsoHandleWithSearchPath(
   CHECK(nullptr != *dso_handle)
       << "For Gpu version of PaddlePaddle, it couldn't find CUDA library: "
-      << dlPath.c_str() << " Please make sure you already specify its path."
-      << "Note: for training data on Cpu using Gpu version of PaddlePaddle,"
-      << "you must specify libcudart.so via LD_LIBRARY_PATH.";
+      << dlPath.c_str() << ". Please make sure you already specify its path. "
+      << "Note: for training data on Cpu using Gpu version of PaddlePaddle, "
+      << "you must specify libcudart via export LD_LIBRARY_PATH for Linux or "
+      << "export DYLD_LIBRARY_PATH for MAC OS.";
 }
 void GetCublasDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+#else
   GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+#endif
 }
 void GetCudnnDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+#else
   GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+#endif
 }
 void GetCudartDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath("", "libcudart.dylib", dso_handle);
+#else
   GetDsoHandleWithSearchPath("", "libcudart.so", dso_handle);
+#endif
 }
 void GetCurandDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+#else
   GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+#endif
 }
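Note: each loader applies the same compile-time suffix selection (.so on Linux, .dylib on macOS). Reduced to a standalone sketch (zlib is just an illustrative library that ships on both platforms):

#include <dlfcn.h>
#include <cstdio>

#if defined(__APPLE__) || defined(__OSX__)
static const char* kLibName = "libz.dylib";  // macOS shared-library suffix
#else
static const char* kLibName = "libz.so.1";   // Linux soname
#endif

int main() {
  void* handle = dlopen(kLibName, RTLD_LAZY);
  if (handle == nullptr) {
    // dlerror() explains the failure, e.g. a directory missing from
    // LD_LIBRARY_PATH (Linux) or DYLD_LIBRARY_PATH (macOS).
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  dlclose(handle);
  return 0;
}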

@@ -35,7 +35,7 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
   real *tab = table + tableId * ldt;
   for (int i = idx; i < dim; i += blockDimX) {
     if (AddRow) {
-      atomicAdd(&tab[i], out[i]);
+      paddle::paddleAtomicAdd(&tab[i], out[i]);
     } else {
       out[i] += tab[i];
     }

@@ -65,7 +65,8 @@ void DataProviderGroup<T>::reset() {
   provider_ = nullptr;
   // shuffle file list
-  std::random_shuffle(fileList_.begin(), fileList_.end());
+  std::shuffle(fileList_.begin(), fileList_.end(),
+               ThreadLocalRandomEngine::get());
   startLoader();
   DataProvider::reset();

@@ -374,7 +374,8 @@ void ProtoDataProvider::reset() {
 }
 void ProtoDataProvider::shuffle() {
-  std::random_shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end());
+  std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(),
+               ThreadLocalRandomEngine::get());
 }
 /*
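Note: std::random_shuffle draws from an unspecified source (commonly rand()); it was later deprecated in C++14 and removed in C++17. std::shuffle makes the engine explicit, which is also what allows reproducible shuffles under a fixed seed. A minimal standalone equivalent (ThreadLocalRandomEngine::get() is Paddle's per-thread engine; a plain std::mt19937 stands in here):

#include <algorithm>
#include <random>
#include <vector>

int main() {
  std::vector<int> sequenceIds = {0, 1, 2, 3, 4, 5};
  // Any uniform random bit generator works; a fixed seed instead of
  // random_device would make the shuffle order reproducible.
  std::mt19937 engine(std::random_device{}());
  std::shuffle(sequenceIds.begin(), sequenceIds.end(), engine);
  return 0;
}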

@@ -17,6 +17,8 @@ limitations under the License. */
 #include "paddle/utils/PythonUtil.h"
 #include <fenv.h>
 #include "paddle/utils/Util.h"
+#include "paddle/utils/Excepts.h"
 namespace paddle {
@@ -44,7 +46,6 @@ PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu,
 }
 void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
-  int feFlag = fegetexcept();
   VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_;
   classInstance_ =
       createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
@@ -55,7 +56,7 @@ void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
   std::string headerInfo =
       std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
   parseHeaderData(headerInfo);
-  feenableexcept(feFlag);
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
 }
 void PyDataProvider::parseHeaderData(const std::string& headerData) {
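Note: fegetexcept() and feenableexcept() (trapping on floating-point exceptions) are glibc extensions that macOS lacks, which is presumably what the new paddle/utils/Excepts.h include shims; hence the code now enables a fixed mask instead of restoring the previously saved flags. Standard C++ can only poll the exception flags portably, as in this small sketch:

#include <cfenv>
#include <cstdio>

int main() {
  // Testing and clearing FP exception flags is portable <cfenv>;
  // enabling traps (feenableexcept) is not.
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile double zero = 0.0;
  volatile double r = 1.0 / zero;  // raises FE_DIVBYZERO, no trap
  (void)r;
  if (std::fetestexcept(FE_DIVBYZERO)) {
    std::puts("FE_DIVBYZERO was raised");
  }
  return 0;
}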

@@ -385,17 +385,17 @@ void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
   }
 }
-extern NeuralNetwork* newCustomNeuralNetwork(
-    const std::string& name, NeuralNetwork* network) __attribute__((weak));
+extern NeuralNetwork* newCustomNerualNetwork(
+    const std::string& name, NeuralNetwork* network) __attribute__((weak));
 NeuralNetwork* NeuralNetwork::newNeuralNetwork(
     const std::string& name,
     NeuralNetwork* rootNetwork) {
-  if (newCustomNeuralNetwork) {
-    return newCustomNeuralNetwork(name, rootNetwork);
-  } else {
-    return new NeuralNetwork(name, rootNetwork);
-  }
+  if (newCustomNerualNetwork) {
+    return newCustomNerualNetwork(name, rootNetwork);
+  } else {
+    return new NeuralNetwork(name, rootNetwork);
+  }
 }
 }  // namespace paddle
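Note: the __attribute__((weak)) declaration is what makes the custom-network hook optional: if no linked object defines the symbol, its address is null and the default branch runs. A standalone sketch of the pattern with hypothetical names (behavior as described for GCC/Clang on ELF; on Mach-O an unresolved weak reference additionally needs a flag such as the -undefined dynamic_lookup that the CMake change above passes under Clang):

#include <cstdio>

// Weak declaration: the program links even if makeCustomValue is never
// defined, in which case the function's address compares equal to nullptr.
extern "C" int makeCustomValue(int base) __attribute__((weak));

int makeValue(int base) {
  if (makeCustomValue != nullptr) {
    return makeCustomValue(base);  // an override was linked in
  }
  return base * 2;  // default implementation
}

int main() {
  std::printf("%d\n", makeValue(21));  // prints 42 unless overridden
  return 0;
}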

@@ -16,9 +16,9 @@
 from paddle.trainer_config_helpers import *
-settings(batch_size=1000)
+settings(batch_size=300)
-data = data_layer(name ="input", size=100000)
+data = data_layer(name ="input", size=10000)
 # emb1 is equal to emb2, note that bias_attr=false
 # and act=LinearActivation() in default.

@@ -16,9 +16,9 @@
 from paddle.trainer_config_helpers import *
-settings(batch_size=1000)
+settings(batch_size=300)
-data = data_layer(name ="input", size=100000)
+data = data_layer(name ="input", size=10000)
 proj1 = table_projection(input=data, size=128)

(Some files were not shown because too many files have changed in this diff.)
