Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_unitests

wangkuiyi-patch-1
yi.wu 7 years ago
commit 40c631e554

@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE) CACHE FILEPATH "openblas library." FORCE)
ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
SET(OPENBLAS_COMMIT "v0.2.20") SET(OPENBLAS_COMMIT "v0.2.20")

@ -17,7 +17,7 @@ if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE) endif(APPLE)
cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES}) cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES} device_context)
cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable) cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)
cc_test(test_tape cc_test(test_tape

@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
auto &dims = main_tensor.dims(); auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) { if (paddle::platform::is_gpu_place(main_tensor.place())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
std::vector<void *> buffers;
size_t numel = main_tensor.numel(); size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
auto place = member_->places_[i]; auto place = member_->places_[i];
void *buffer; void *buffer;
@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
t->Resize(dims); t->Resize(dims);
buffer = t->mutable_data(place, main_tensor.type()); buffer = t->mutable_data(place, main_tensor.type());
} }
auto &nccl_ctx = member_->nccl_ctxs_->at(place); buffers.push_back(buffer);
platform::dynload::ncclBcast(buffer, numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
} }
member_->nccl_ctxs_->WaitAll();
PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
"variables' buffer size to bcast NOT equal to places");
{
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) {
auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
}
member_->nccl_ctxs_->WaitAll();
}
#else #else
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA");
#endif #endif

@ -20,16 +20,20 @@ limitations under the License. */
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/pybind/pybind.h"
DEFINE_string(devices, "", "The devices to be used which is joined by comma."); DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
DEFINE_bool(init_p2p, false, "Whether to init p2p."); DEFINE_bool(init_p2p, false, "Whether to init p2p.");
DEFINE_int32(math_num_threads, 1,
"Number of threads used to run math functions.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
void Init(const std::vector<std::string> argv) { void Init(const std::vector<std::string> argv) {
framework::InitGflags(argv); framework::InitGflags(argv);
operators::math::SetNumThreads(FLAGS_math_num_threads);
// init devices // init devices
std::vector<int> devices; std::vector<int> devices;
std::string token; std::string token;

@ -169,7 +169,8 @@ class RequestPrefetch final : public RequestBase {
auto scope = request_->GetMutableLocalScope(); auto scope = request_->GetMutableLocalScope();
auto invar = scope->FindVar(in_var_name); auto invar = scope->FindVar(in_var_name);
framework::Variable* outvar = scope->FindVar(out_var_name); // out var must be created in local scope!
framework::Variable* outvar = scope->Var(out_var_name);
request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name); request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);

@ -20,13 +20,16 @@
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include <mkl_cblas.h> #include <mkl_cblas.h>
#include <mkl_lapacke.h> #include <mkl_lapacke.h>
#include <mkl_service.h>
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
#include <cblas.h> #include <cblas.h>
#ifdef LAPACK_FOUND
#include <lapacke.h> #include <lapacke.h>
#endif #endif
#endif
#ifndef LAPACK_FOUND #ifndef LAPACK_FOUND
extern "C" { extern "C" {
@ -46,6 +49,18 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
static void SetNumThreads(int num_threads) {
#ifdef PADDLE_USE_OPENBLAS
int real_num_threads = num_threads > 1 ? num_threads : 1;
openblas_set_num_threads(real_num_threads);
#elif defined(PADDLE_WITH_MKLML)
int real_num_threads = num_threads > 1 ? num_threads : 1;
mkl_set_num_threads(real_num_threads);
#else
PADDLE_ENFORCE(false, "To be implemented.");
#endif
}
/** /**
* Matrix Descriptor of a memory buffer. * Matrix Descriptor of a memory buffer.
* *

@ -21,8 +21,10 @@ limitations under the License. */
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
#include <cblas.h> #include <cblas.h>
#ifdef LAPACK_FOUND
#include <lapacke.h> #include <lapacke.h>
#endif #endif
#endif
#ifndef LAPACK_FOUND #ifndef LAPACK_FOUND
extern "C" { extern "C" {

@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
} }
} }
// NOTE(minqiyang): according to the ncclGroupEnd documentations:
// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
// ncclGroupEnd will wait for all communicators to be initialized, which will
// cause blocking problem when a runtime_error was thrown, so try only guard
// NCCL actions when use it.
class NCCLGroupGuard { class NCCLGroupGuard {
public: public:
static std::mutex &NCCLMutex() { static std::mutex &NCCLMutex() {

@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef MATHFUNCTIONS_H_ #pragma once
#define MATHFUNCTIONS_H_
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include <mkl_cblas.h> #include <mkl_cblas.h>
@ -21,7 +20,7 @@ limitations under the License. */
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#if defined(PADDLE_USE_VECLIB) #ifdef PADDLE_USE_VECLIB
extern "C" { extern "C" {
#include <cblas.h> #include <cblas.h>
#include <clapack.h> #include <clapack.h>
@ -30,8 +29,10 @@ extern "C" {
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
#include <cblas.h> #include <cblas.h>
#ifdef LAPACK_FOUND
#include <lapacke.h> #include <lapacke.h>
#endif #endif
#endif
#ifndef LAPACK_FOUND #ifndef LAPACK_FOUND
extern "C" { extern "C" {
@ -126,5 +127,3 @@ template <class T>
void vTanh(const int n, const T* a, T* r); void vTanh(const int n, const T* a, T* r);
} // namespace paddle } // namespace paddle
#endif // MATHFUNCTIONS_H_

@ -4242,7 +4242,7 @@ def random_crop(x, shape, seed=None):
seed_out = helper.create_tmp_variable(dtype="int64") seed_out = helper.create_tmp_variable(dtype="int64")
helper.append_op( helper.append_op(
type="random_crop", type="random_crop",
inputs={"X": input, inputs={"X": x,
"Seed": seed}, "Seed": seed},
outputs={"Out": out, outputs={"Out": out,
"SeedOut": seed_out}, "SeedOut": seed_out},

@ -41,8 +41,8 @@ function(py_test_modules TARGET_NAME)
endfunction() endfunction()
list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_warpctc_op)
list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_dist_train)
#list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
#list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
# TODO(wuyi): this test hungs on CI, will add it back later # TODO(wuyi): this test hungs on CI, will add it back later
list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
foreach(TEST_OP ${TEST_OPS}) foreach(TEST_OP ${TEST_OPS})
@ -50,3 +50,5 @@ foreach(TEST_OP ${TEST_OPS})
endforeach(TEST_OP) endforeach(TEST_OP)
py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
py_test_modules(test_dist_train MODULES test_dist_train SERIAL) py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)

Loading…
Cancel
Save