Merge branch 'develop' into core_fix_openblas_threads

wangkuiyi-patch-1
Liu Yiqun 7 years ago
commit 50ba205d79

@@ -70,7 +70,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
# version(1.7.1 for now), which causes building documentation failed.
RUN pip install --upgrade pip==9.0.3 && \
RUN easy_install -U pip && \
pip install -U wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \
pip install sphinx-rtd-theme==0.1.9 recommonmark

@@ -159,6 +159,7 @@ def run_benchmark(model, args):
paddle.dataset.mnist.train(), batch_size=args.batch_size)
accuracy = fluid.metrics.Accuracy()
train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num):
accuracy.reset()
@@ -175,17 +176,20 @@ def run_benchmark(model, args):
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([len(y_data), 1])
outs = exe.run(
fluid.default_main_program(),
outs = train_exe.run(
feed={"pixel": img_data,
"label": y_data},
fetch_list=[avg_cost, batch_acc, batch_size_tensor]
fetch_list=[
avg_cost.name, batch_acc.name, batch_size_tensor.name
]
) # The accuracy is the accumulation of batches, but not the current batch.
accuracy.update(value=outs[1], weight=outs[2])
accuracy.update(
value=np.array(np.mean(outs[1])),
weight=np.mean(np.array(outs[2])))
iters += 1
num_samples += len(y_data)
loss = np.array(outs[0])
acc = np.array(outs[1])
loss = np.mean(np.array(outs[0]))
acc = np.mean(np.array(outs[1]))
train_losses.append(loss)
train_accs.append(acc)
print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %

@@ -241,6 +241,7 @@ def run_benchmark(model, args):
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
accuracy = fluid.average.WeightedAverage()
train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
if args.use_fake_data:
data = train_reader().next()
image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
@@ -264,14 +265,17 @@ def run_benchmark(model, args):
data)).astype('float32')
label = np.array(map(lambda x: x[1], data)).astype('int64')
label = label.reshape([-1, 1])
loss, acc, weight = exe.run(
fluid.default_main_program(),
loss, acc, weight = train_exe.run(
feed={'data': image,
'label': label},
fetch_list=[avg_cost, batch_acc, batch_size_tensor])
fetch_list=[
avg_cost.name, batch_acc.name, batch_size_tensor.name
])
iters += 1
num_samples += len(label)
accuracy.add(value=acc, weight=weight)
accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
loss = np.mean(np.array(loss))
acc = np.mean(np.array(acc))
train_losses.append(loss)
train_accs.append(acc)
print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %

@@ -169,6 +169,7 @@ def main():
iters, num_samples, start_time = 0, 0, time.time()
accuracy = fluid.average.WeightedAverage()
train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
for pass_id in range(args.pass_num):
accuracy.reset()
train_accs = []
@@ -184,14 +185,17 @@ def main():
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
loss, acc, weight = exe.run(
fluid.default_main_program(),
loss, acc, weight = train_exe.run(
feed={"pixel": img_data,
"label": y_data},
fetch_list=[avg_cost, batch_acc, batch_size_tensor])
accuracy.add(value=acc, weight=weight)
fetch_list=[
avg_cost.name, batch_acc.name, batch_size_tensor.name
])
accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
iters += 1
num_samples += len(y_data)
loss = np.mean(np.array(loss))
acc = np.mean(np.array(acc))
print(
"Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss, acc)

@@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost")
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0")
set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)

@@ -21,11 +21,12 @@ else()
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror"
# eigen on cuda9.1 missing header of math_funtions.hpp
# https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c
PREFIX ${EIGEN_SOURCE_DIR}
DOWNLOAD_NAME "eigen"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""

@@ -53,11 +53,9 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "v0.14"
GIT_TAG "db3424ad44901513c03a1ea31ccaacdf633fbe9f"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
# Patch MKLDNN to compile with gcc 4.8, the related issue is in intel/mkl-dnn#237.
PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/mkldnn.hpp ${MKLDNN_SOURCES_DIR}/src/extern_mkldnn/include/mkldnn.hpp
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}

@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.3.20180406")
SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")

@@ -47,8 +47,6 @@ ExternalProject_Add(
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_COMMAND make -j8
INSTALL_COMMAND make install
)
add_library(snappy STATIC IMPORTED GLOBAL)

@@ -46,8 +46,6 @@ ExternalProject_Add(
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_COMMAND make -j8
INSTALL_COMMAND make install
DEPENDS snappy
)

@@ -98,6 +98,14 @@ elseif (WITH_MKLML)
)
endif()
if(WITH_MKLDNN)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mkldnn")
copy(mkldnn_lib
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
DSTS ${dst_dir} ${dst_dir}/lib
)
endif()
if(NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
copy(snappy_lib
@@ -148,4 +156,10 @@ copy(string_lib
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
)
set(module "pybind")
copy(pybind_lib
SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
DSTS ${dst_dir}/${module}
)
add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})

@@ -40,7 +40,7 @@ template <typename T>
class FCOp : public OperatorBase {
public:
void Run(...) {
add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b");
add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b"));
}
};
REGISTER_OP(FCOp, "fc");

@@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY)
endif()
add_subdirectory(testing)
if(NOT MOBILE_INFERENCE AND NOT RPI)
if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
add_subdirectory(fluid)
endif()

@@ -36,9 +36,11 @@ void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
VLOG(3) << "DeviceTransform in, src_place " << in.place()
<< " dst_place: " << dst_place;
auto* dev_ctx = GetDeviceContext(in.place(), dst_place);
dev_ctx->Wait();
TensorCopy(in, dst_place, *dev_ctx, out);
dev_ctx->Wait();
if (platform::is_gpu_place(in.place()) && platform::is_cpu_place(dst_place)) {
dev_ctx->Wait();
}
}
} // namespace framework
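
Note on the hunk above: the unconditional dev_ctx->Wait() calls around TensorCopy are replaced by a single wait that only runs for GPU-to-CPU transfers, since the host is about to read the destination buffer, while other copies can stay queued on the device stream. A minimal standalone sketch of that pattern using the plain CUDA runtime API (illustrative only, not Paddle's code; the function name and parameters below are made up):

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// Copy n floats off the device; synchronize only when the destination is
// host memory, mirroring the is_gpu_place/is_cpu_place check above.
void CopyFromDevice(const float* src_dev, float* dst, std::size_t n,
                    bool dst_is_host, cudaStream_t stream) {
  cudaMemcpyAsync(dst, src_dev, n * sizeof(float),
                  dst_is_host ? cudaMemcpyDeviceToHost
                              : cudaMemcpyDeviceToDevice,
                  stream);
  // A device-to-host copy must be finished before the caller touches dst;
  // a device-to-device copy stays ordered on the stream and needs no wait.
  if (dst_is_host) {
    cudaStreamSynchronize(stream);
  }
}
```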

@@ -58,6 +58,7 @@ static DataTypeMap* InitDataTypeMap() {
RegType(bool, proto::VarType::BOOL);
RegType(size_t, proto::VarType::SIZE_T);
RegType(int16_t, proto::VarType::INT16);
RegType(uint8_t, proto::VarType::UINT8);
#undef RegType
return retv;

@@ -47,8 +47,14 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
case proto::VarType::BOOL:
visitor.template operator()<bool>();
break;
case proto::VarType::UINT8:
visitor.template operator()<uint8_t>();
break;
case proto::VarType::INT16:
visitor.template operator()<int16_t>();
break;
default:
PADDLE_THROW("Not supported");
PADDLE_THROW("Not supported %d", type);
}
}
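
For reference, a self-contained sketch of the switch-based visitor dispatch that VisitDataType performs, extended with the uint8_t case added above (the enum and visitor below are simplified stand-ins, not Paddle's actual definitions):

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>

enum class VarType { BOOL, INT16, UINT8, FP32 };

template <typename Visitor>
void VisitDataType(VarType type, Visitor visitor) {
  switch (type) {
    case VarType::BOOL:
      visitor.template operator()<bool>();
      break;
    case VarType::INT16:
      visitor.template operator()<int16_t>();
      break;
    // Newly handled type; without this case the call would fall through to
    // the "Not supported" error below.
    case VarType::UINT8:
      visitor.template operator()<uint8_t>();
      break;
    case VarType::FP32:
      visitor.template operator()<float>();
      break;
    default:
      throw std::runtime_error("Not supported");
  }
}

struct PrintSizeVisitor {
  template <typename T>
  void operator()() const {
    std::cout << "element size = " << sizeof(T) << " bytes\n";
  }
};

int main() {
  VisitDataType(VarType::UINT8, PrintSizeVisitor{});  // element size = 1 bytes
  VisitDataType(VarType::FP32, PrintSizeVisitor{});   // element size = 4 bytes
}
```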

@@ -48,17 +48,18 @@ void FetchOpHandle::RunImpl() {
WaitInputVarGenerated(platform::CPUPlace());
tensors_.resize(inputs_.size());
auto *var_handle = static_cast<VarHandle *>(inputs_[0]);
auto &var_name = var_handle->name_;
platform::CPUPlace cpu;
auto &scopes = *local_scopes_;
for (size_t i = 0; i < scopes.size(); ++i) {
auto &scope = scopes[i];
auto *var =
scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
for (size_t i = 0; i < inputs_.size(); ++i) {
auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
auto &scope = scopes.at(var_handle->scope_idx_);
auto *var = scope->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(var_handle->name_);
PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
var_name);
var_handle->name_);
auto &t = var->Get<framework::LoDTensor>();
if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA

@@ -98,7 +98,7 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op,
return false;
};
if (op.Type() == "split") {
if (op.Type() == "split" || op.Type() == "split_byref") {
return checker(op.OutputArgumentNames(), send_op->InputArgumentNames());
} else if (op.Type() == "concat") {
return checker(op.InputArgumentNames(), send_op->OutputArgumentNames());

@@ -70,6 +70,14 @@ class OpHandleBase {
const std::vector<VarHandleBase *> &Inputs() const { return inputs_; }
size_t NoDupInputSize() const {
std::unordered_set<VarHandleBase *> res;
for (auto *var : inputs_) {
res.emplace(var);
}
return res.size();
}
const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
protected:

@@ -174,7 +174,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
void ThreadedSSAGraphExecutor::InsertPendingOp(
std::unordered_map<OpHandleBase *, size_t> *pending_ops,
OpHandleBase *op_instance) const {
pending_ops->insert({op_instance, op_instance->Inputs().size()});
pending_ops->insert({op_instance, op_instance->NoDupInputSize()});
}
void ThreadedSSAGraphExecutor::InsertPendingVar(
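
Note on the two hunks above: the pending-op counter now uses NoDupInputSize() instead of Inputs().size(), so an input variable that appears more than once in an op's input list is only counted once. A minimal standalone sketch of that deduplicated counting (VarHandle/OpHandle below are illustrative stand-ins, not Paddle's classes):

```cpp
#include <cstddef>
#include <iostream>
#include <unordered_set>
#include <vector>

struct VarHandle {};

struct OpHandle {
  std::vector<VarHandle*> inputs;

  // Counts every entry, duplicates included (the old behaviour).
  std::size_t InputSize() const { return inputs.size(); }

  // Counts each distinct input once, mirroring NoDupInputSize().
  std::size_t NoDupInputSize() const {
    std::unordered_set<VarHandle*> uniq(inputs.begin(), inputs.end());
    return uniq.size();
  }
};

int main() {
  VarHandle a, b;
  OpHandle op;
  op.inputs = {&a, &b, &a};  // the same variable feeds the op twice
  std::cout << op.InputSize() << "\n";       // 3
  std::cout << op.NoDupInputSize() << "\n";  // 2, the value now used as the
                                             // op's pending-input count
}
```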

@@ -228,7 +228,8 @@ static bool has_fetch_operators(
void Executor::Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets,
bool create_vars, const std::string& feed_holder_name,
bool create_local_scope, bool create_vars,
const std::string& feed_holder_name,
const std::string& fetch_holder_name) {
platform::RecordBlock b(kProgramId);
bool has_feed_ops =
@@ -290,8 +291,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
}
auto ctx = Prepare(*copy_program, 0);
RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, create_vars,
feed_holder_name, fetch_holder_name);
RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets,
create_local_scope, create_vars, feed_holder_name,
fetch_holder_name);
}
std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
@@ -366,8 +368,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
void Executor::RunPreparedContext(
ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets, bool create_vars,
const std::string& feed_holder_name, const std::string& fetch_holder_name) {
std::map<std::string, LoDTensor*>* fetch_targets, bool create_local_scope,
bool create_vars, const std::string& feed_holder_name,
const std::string& fetch_holder_name) {
auto& global_block = ctx->prog_.Block(ctx->block_id_);
PADDLE_ENFORCE(
@@ -387,7 +390,7 @@ void Executor::RunPreparedContext(
}
}
RunPreparedContext(ctx, scope, create_vars, create_vars);
RunPreparedContext(ctx, scope, create_local_scope, create_vars);
// obtain the data of fetch_targets from fetch_holder
for (auto* op : global_block.AllOps()) {

@@ -57,7 +57,7 @@ class Executor {
void Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets,
bool create_vars = true,
bool create_local_scope = true, bool create_vars = true,
const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch");
@@ -76,6 +76,7 @@ class Executor {
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets,
bool create_local_scope = true,
bool create_vars = true,
const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch");
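
Note on the executor hunks above: Run() and RunPreparedContext() gain a separate create_local_scope flag, and the internal call that previously forwarded create_vars for both roles now forwards each flag independently. A small standalone sketch of that change (plain functions, not Paddle's API):

```cpp
#include <iostream>

void RunPreparedContext(bool create_local_scope, bool create_vars) {
  std::cout << "local scope: " << create_local_scope
            << ", create vars: " << create_vars << "\n";
}

// Before: one flag served both purposes, so callers could not keep the outer
// scope while still creating variables.
void RunOld(bool create_vars = true) {
  RunPreparedContext(/*create_local_scope=*/create_vars, create_vars);
}

// After: two independent flags with their own defaults, as in the new
// Executor::Run / RunPreparedContext signatures.
void RunNew(bool create_local_scope = true, bool create_vars = true) {
  RunPreparedContext(create_local_scope, create_vars);
}

int main() {
  RunOld(false);        // both behaviours were forced off together
  RunNew(false, true);  // reuse the outer scope, still create variables
}
```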

@@ -103,6 +103,7 @@ message VarType {
FP64 = 6;
// Tensor<size_t> is used in C++.
SIZE_T = 19;
UINT8 = 20;
// Other types that may need additional descriptions
LOD_TENSOR = 7;

@@ -228,11 +228,12 @@ TEST(LoD, CheckAbsLoD) {
ASSERT_FALSE(CheckAbsLoD(abs_lod0));
}
TEST(LoDTensor, RecordIO) {
template <typename T>
static void TestRecordIO() {
LoDTensor tensor;
int* tmp = tensor.mutable_data<int>(make_ddim({4, 5}), platform::CPUPlace());
T* tmp = tensor.mutable_data<T>(make_ddim({4, 5}), platform::CPUPlace());
for (int i = 0; i < 20; ++i) {
tmp[i] = i;
tmp[i] = static_cast<T>(i);
}
std::stringstream* stream = new std::stringstream();
@@ -247,7 +248,7 @@ TEST(LoDTensor, RecordIO) {
auto assert_tensor_ok = [](const LoDTensor& tensor) {
for (int i = 0; i < 20; ++i) {
ASSERT_EQ(tensor.data<int>()[i], i);
ASSERT_EQ(tensor.data<T>()[i], static_cast<T>(i));
}
};
@@ -265,5 +266,13 @@ TEST(LoDTensor, RecordIO) {
}
}
TEST(LoDTensor, RecordIO) {
TestRecordIO<int>();
TestRecordIO<int16_t>();
TestRecordIO<uint8_t>();
TestRecordIO<float>();
TestRecordIO<double>();
}
} // namespace framework
} // namespace paddle

@@ -49,7 +49,7 @@ class OpConverter {
// convert fluid block to tensorrt network
void ConvertBlock(const framework::proto::BlockDesc& block,
TensorRTEngine* engine) {
for (size_t i = 0; i < block.ops_size(); i++) {
for (int i = 0; i < block.ops_size(); i++) {
const auto& op = block.ops(i);
OpConverter::Run(op, engine);
}
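
Note on the loop-index change above: protobuf's repeated-field size accessors (such as ops_size()) return int, so iterating with size_t mixes signed and unsigned operands in the comparison and triggers -Wsign-compare, which typically fails builds that treat warnings as errors. A tiny standalone illustration (the ops_size() below is a stand-in, not the generated protobuf accessor):

```cpp
#include <iostream>

int ops_size() { return 3; }  // stand-in for block.ops_size()

int main() {
  // for (size_t i = 0; i < ops_size(); ++i)  // warns: signed/unsigned compare
  for (int i = 0; i < ops_size(); ++i) {      // index type matches the size type
    std::cout << "op " << i << "\n";
  }
}
```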

Some files were not shown because too many files have changed in this diff.