Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into add-ROIPooling
commit
c07cbf7daf
@ -0,0 +1,67 @@
|
||||
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Skip this module entirely unless MKL-DNN support was requested.
# The option name is handed to if() unexpanded: if() dereferences it itself, so
# an unset or empty WITH_MKLDNN is simply treated as "off" instead of the
# expansion vanishing and leaving a malformed `if(NOT)` condition behind.
if(NOT WITH_MKLDNN)
  return()
endif()
|
||||
|
||||
INCLUDE(ExternalProject)
|
||||
|
||||
# Name of the external-project target and the directories it uses, all rooted
# at THIRD_PARTY_PATH. The include directory is published as a forced cache
# entry.
set(MKLDNN_PROJECT     "extern_mkldnn")
set(MKLDNN_SOURCES_DIR "${THIRD_PARTY_PATH}/mkldnn")
set(MKLDNN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/mkldnn")
set(MKLDNN_INC_DIR
    "${MKLDNN_INSTALL_DIR}/include"
    CACHE PATH "mkldnn include directory." FORCE)
|
||||
|
||||
# MKL-DNN is not supported on Windows or macOS: warn, force the WITH_MKLDNN
# cache entry to OFF and stop processing this module.
if(WIN32 OR APPLE)
  # message() joins its arguments with no separator, so the second fragment
  # carries its own leading space to keep the two sentences apart.
  message(WARNING
    "Windows or Mac is not supported with MKLDNN in Paddle yet."
    " Force WITH_MKLDNN=OFF")
  set(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
  return()
endif()
|
||||
|
||||
# Path of the shared library the MKL-DNN external project installs; published
# as a forced cache entry.
set(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)

# Add the MKL-DNN library directory to the install-time RPATH.
message(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
# list(APPEND) is used instead of set(VAR "${VAR}" new) so that no empty
# leading list element is created while CMAKE_INSTALL_RPATH is still empty.
list(APPEND CMAKE_INSTALL_RPATH "${MKLDNN_INSTALL_DIR}/lib")

# Make the MKL-DNN headers visible to targets defined after this point.
include_directories(${MKLDNN_INC_DIR})
|
||||
|
||||
# When MKLML is the selected BLAS provider, make the MKL-DNN external project
# depend on the MKLML one and record the MKLML root and OpenMP library
# locations for it.
# The expansion is quoted so that an unset CBLAS_PROVIDER compares as an empty
# string instead of disappearing and turning the condition into a syntax error.
if("${CBLAS_PROVIDER}" STREQUAL "MKLML")
  set(MKLDNN_DEPENDS ${MKLML_PROJECT})
  set(MKLDNN_MKLROOT ${MKLML_ROOT})
  set(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB})
  set(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR})
  message(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
endif()
|
||||
|
||||
# Clone MKL-DNN at tag v0.9 and build/install it under MKLDNN_INSTALL_DIR.
# The install prefix and MKLROOT are passed both on the command line
# (CMAKE_ARGS) and as initial cache entries (CMAKE_CACHE_ARGS).
# DEPENDS stays unquoted so that an empty MKLDNN_DEPENDS contributes no value.
ExternalProject_Add(
  ${MKLDNN_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  DEPENDS          ${MKLDNN_DEPENDS}
  GIT_REPOSITORY   "https://github.com/01org/mkl-dnn.git"
  GIT_TAG          "v0.9"
  PREFIX           ${MKLDNN_SOURCES_DIR}
  UPDATE_COMMAND   ""
  CMAKE_ARGS       -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
                   -DMKLROOT=${MKLDNN_MKLROOT}
  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
                   -DMKLROOT:PATH=${MKLDNN_MKLROOT}
)
|
||||
|
||||
# Wrap the externally built library in the imported target `mkldnn`, order it
# after the external project, and append it to external_project_dependencies.
add_library(mkldnn SHARED IMPORTED GLOBAL)
set_property(TARGET mkldnn
             PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
add_dependencies(mkldnn ${MKLDNN_PROJECT})
message(STATUS "Mkldnn library: ${MKLDNN_LIB}")
list(APPEND external_project_dependencies mkldnn)
|
@ -0,0 +1,67 @@
|
||||
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Skip this module entirely unless MKLML support was requested.
# The option name is handed to if() unexpanded: if() dereferences it itself, so
# an unset or empty WITH_MKLML is simply treated as "off" instead of the
# expansion vanishing and leaving a malformed `if(NOT)` condition behind.
if(NOT WITH_MKLML)
  return()
endif()
|
||||
|
||||
# MKLML is not supported on Windows or macOS: warn, force the WITH_MKLML cache
# entry to OFF and stop processing this module.
if(WIN32 OR APPLE)
  # message() joins its arguments with no separator, so the second fragment
  # carries its own leading space to keep the two sentences apart.
  message(WARNING
    "Windows or Mac is not supported with MKLML in Paddle yet."
    " Force WITH_MKLML=OFF")
  set(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
  return()
endif()
|
||||
|
||||
INCLUDE(ExternalProject)
|
||||
|
||||
# External-project target name, package version and download URL.
set(MKLML_PROJECT "extern_mklml")
set(MKLML_VER     "mklml_lnx_2018.0.20170720")
set(MKLML_URL     "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")

# Where the archive is unpacked (below THIRD_PARTY_PATH/mklml).
set(MKLML_SOURCE_DIR   "${THIRD_PARTY_PATH}/mklml")
set(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")

# Where the unpacked tree is installed: <install root>/mklml/<version>.
set(MKLML_DST_DIR      "mklml")
set(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
set(MKLML_INSTALL_DIR  "${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}")
set(MKLML_ROOT         "${MKLML_INSTALL_DIR}/${MKLML_VER}")

# Header directory, library directory and the two shared libraries inside the
# installed tree.
set(MKLML_INC_DIR  "${MKLML_ROOT}/include")
set(MKLML_LIB_DIR  "${MKLML_ROOT}/lib")
set(MKLML_LIB      "${MKLML_LIB_DIR}/libmklml_intel.so")
set(MKLML_IOMP_LIB "${MKLML_LIB_DIR}/libiomp5.so")
|
||||
# Add the MKLML library directory to the install-time RPATH.
# list(APPEND) is used instead of set(VAR "${VAR}" new) so that no empty
# leading list element is created while CMAKE_INSTALL_RPATH is still empty.
list(APPEND CMAKE_INSTALL_RPATH "${MKLML_ROOT}/lib")

# Make the MKLML headers visible to targets defined after this point.
include_directories(${MKLML_INC_DIR})
|
||||
|
||||
# Generate a tiny CMakeLists.txt in the download directory whose only job is to
# install the unpacked MKLML_VER directory into MKLML_DST_DIR (relative to the
# install prefix given to the external project).
# cmake_minimum_required() is written first so that policies are established
# before project() runs in the generated project.
file(WRITE "${MKLML_DOWNLOAD_DIR}/CMakeLists.txt"
  "cmake_minimum_required(VERSION 3.0)\n"
  "PROJECT(MKLML)\n"
  "install(DIRECTORY ${MKLML_VER}\n"
  "        DESTINATION ${MKLML_DST_DIR})\n")
|
||||
|
||||
# Download the prebuilt MKLML archive, unpack it into MKLML_DOWNLOAD_DIR and
# install it under MKLML_INSTALL_ROOT through the CMakeLists.txt generated in
# that directory.
# The download is streamed from wget straight into tar. TLS certificate
# verification is deliberately left enabled (no --no-check-certificate): the
# archive contains binaries that get linked into the build and is not checked
# against a hash, so the HTTPS certificate is the only integrity guarantee.
ExternalProject_Add(
  ${MKLML_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  PREFIX               ${MKLML_SOURCE_DIR}
  DOWNLOAD_DIR         ${MKLML_DOWNLOAD_DIR}
  DOWNLOAD_COMMAND     wget -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR}
  DOWNLOAD_NO_PROGRESS 1
  UPDATE_COMMAND       ""
  CMAKE_ARGS           -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
  CMAKE_CACHE_ARGS     -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
)
|
||||
|
||||
# Wrap the installed MKLML shared library in the imported target `mklml`, order
# it after the external project, and append it to external_project_dependencies.
add_library(mklml SHARED IMPORTED GLOBAL)
set_property(TARGET mklml
             PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
add_dependencies(mklml ${MKLML_PROJECT})
list(APPEND external_project_dependencies mklml)
|
@ -0,0 +1,101 @@
|
||||
# Analysis of large model distributed training in Paddle
|
||||
|
||||
***NOTE: This is only a note on how we implemented this scheme in V1, not a new design.***
|
||||
|
||||
## What is it
|
||||
|
||||
We often encounter cases where the (sparse) embedding layer parameters are so large that they cannot be stored in the trainer's memory during training. So we need to put them on several servers and fetch them row by row instead of fetching all of the parameters.
|
||||
|
||||
## How to use
|
||||
|
||||
Specify command-line arguments like `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1 --use_old_updater=1` when starting the paddle trainer. Also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting the pserver processes.
|
||||
|
||||
Accordingly, configure your embedding layers like:
|
||||
|
||||
```python
|
||||
SPARSE_REMOTE=True
|
||||
|
||||
w1 = data_layer(name="w1", size=dict_size)
|
||||
emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
|
||||
w2 = data_layer(name="w2", size=dict_size)
|
||||
emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
|
||||
...
|
||||
```
|
||||
|
||||
## Implementation details
|
||||
|
||||
```c++
|
||||
enum MatType {
|
||||
MAT_NORMAL,
|
||||
MAT_NORMAL_SHARED,
|
||||
MAT_VALUE_SHARED,
|
||||
MAT_SPARSE_ROW_IDS,
|
||||
MAT_SPARSE_ROW_AUTO_GROW,
|
||||
MAT_CACHE_ROW,
|
||||
MAT_SPARSE_ROW,
|
||||
MAT_SPARSE_ROW_PREFETCH,
|
||||
MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
|
||||
};
|
||||
```
|
||||
|
||||
`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only rows of the matrix when training.
|
||||
|
||||
In `trainer_internal.cpp:L93 trainOneBatch`:
|
||||
|
||||
```c++
|
||||
if (config_->getOptConfig().use_sparse_remote_updater()) {
|
||||
REGISTER_TIMER("prefetch");
|
||||
gradientMachine_->prefetch(inArgs);
|
||||
parameterUpdater_->getParametersRemote();
|
||||
}
|
||||
```
|
||||
|
||||
During the actual network forward and backward passes, at the beginning of each batch, the trainer will try to download one row of data from the pserver.
|
||||
|
||||
In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
|
||||
|
||||
```c++
|
||||
if (fullSize) {
|
||||
...
|
||||
} else {
|
||||
getParams = [&] {
|
||||
parameterClient_->getParameterSparse(
|
||||
/* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
|
||||
};
|
||||
applyL1 = [](Parameter& para, real decayRate) {
|
||||
para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
Calling `parameterClient_->getParameterSparse` makes a remote call to the pserver's `getParameterSparse`:
|
||||
|
||||
```c++
|
||||
void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
|
||||
std::vector<Buffer>& inputBuffers,
|
||||
SendParameterResponse* response,
|
||||
std::vector<Buffer>* outputBuffers) {
|
||||
(void)inputBuffers;
|
||||
auto& buffer = *readWriteBuffer_;
|
||||
size_t numReals = 0;
|
||||
for (const auto& block : request.blocks()) {
|
||||
numReals += getParameterConfig(block).dims(1);
|
||||
}
|
||||
buffer.resize(numReals);
|
||||
|
||||
VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
|
||||
|
||||
ReadLockGuard guard(parameterMutex_);
|
||||
size_t offset = 0;
|
||||
for (const auto& block : request.blocks()) {
|
||||
size_t width = getParameterConfig(block).dims(1);
|
||||
Buffer buf = {buffer.data() + offset, width};
|
||||
int type = request.send_back_parameter_type();
|
||||
sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
|
||||
offset += width;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`getParameterConfig(block).dims(1)` returns the width of the current "parameter block" (a shard of a parameter object),
|
||||
then `getParameterSparse` remote call returns only one row of data to the client.
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue