Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-async-listen-and-serv-op

release/0.12.0
qiaolongfei 7 years ago
commit 1d75674614

@ -1,5 +1,8 @@
# An image for building paddle binaries
# Use cuda devel base image for both cpu and gpu environment
# When you modify it, please be aware of cudnn-runtime version
# and libcudnn.so.x in paddle/scripts/docker/build.sh
FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>

@ -146,6 +146,7 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
// Erase the ops in the half-open index range [s, e).
//
// The request is ignored when the range is not valid for ops_: start at or
// past the end, end past the end, or start after end. Bounds are compared as
// indices rather than by iterator equality, because the equality test
// (a) never caught s or e larger than ops_.size(), where forming
//     ops_.begin() + s is already undefined behaviour, and
// (b) rejected e == ops_.size(), which is the valid end bound for erasing
//     the trailing ops.
if (s >= ops_.size() || e > ops_.size() || s > e) {
  return;
}
// Set the need_update_ flag: the op list is about to change.
need_update_ = true;
ops_.erase(ops_.begin() + s, ops_.begin() + e);
}

@ -103,9 +103,7 @@ static void BuildVar(const std::string& param_name,
}
TEST(Operator, CPUtoGPU) {
using namespace paddle::framework;
using namespace paddle::platform;
InitDevices(true);
paddle::framework::InitDevices(true);
paddle::framework::Scope scope;
paddle::platform::CPUPlace cpu_place;
@ -118,8 +116,9 @@ TEST(Operator, CPUtoGPU) {
auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc);
// prepare input
auto* in_t = scope.Var("IN1")->GetMutable<LoDTensor>();
auto* src_ptr = in_t->mutable_data<float>({2, 3}, CPUPlace());
auto* in_t = scope.Var("IN1")->GetMutable<paddle::framework::LoDTensor>();
auto* src_ptr =
in_t->mutable_data<float>({2, 3}, paddle::platform::CPUPlace());
for (int i = 0; i < 2 * 3; ++i) {
src_ptr[i] = static_cast<float>(i);
}
@ -128,7 +127,7 @@ TEST(Operator, CPUtoGPU) {
auto* output = scope.Var("OUT1");
cpu_op->Run(scope, cpu_place);
auto* output_ptr = output->Get<LoDTensor>().data<float>();
auto* output_ptr = output->Get<paddle::framework::LoDTensor>().data<float>();
for (int i = 0; i < 2 * 3; ++i) {
ASSERT_EQ(output_ptr[i], static_cast<float>(i) * 2);
}
@ -153,12 +152,14 @@ TEST(Operator, CPUtoGPU) {
VLOG(3) << "after gpu_op run";
// auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
DeviceContextPool& pool = DeviceContextPool::Instance();
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto dev_ctx = pool.Get(cuda_place);
paddle::framework::Tensor output_tensor;
TensorCopy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
&output_tensor);
paddle::framework::TensorCopy(output2->Get<paddle::framework::LoDTensor>(),
paddle::platform::CPUPlace(), *dev_ctx,
&output_tensor);
dev_ctx->Wait();
float* output2_ptr = output_tensor.data<float>();

@ -18,27 +18,28 @@
#include "paddle/fluid/platform/device_context.h"
// Exercises TransDataLayout on a CPU tensor: an input tagged NHWC with dims
// {2, 3, 1, 2} is expected to come out tagged NCHW with dims {2, 2, 3, 1}.
TEST(DataTransform, DataLayoutFunction) {
using namespace paddle::framework;
using namespace paddle::platform;
auto place = CPUPlace();
Tensor in = Tensor();
Tensor out = Tensor();
// NOTE(review): the buffer is allocated as double while both kernel types
// below are declared FP32 — confirm the element type is irrelevant to a
// layout-only transform.
in.mutable_data<double>(make_ddim({2, 3, 1, 2}), place);
in.set_layout(DataLayout::kNHWC);
auto kernel_nhwc = OpKernelType(proto::VarType::FP32, place,
DataLayout::kNHWC, LibraryType::kPlain);
// NOTE(review): the variable is spelled "ncwh" but carries DataLayout::kNCHW.
auto kernel_ncwh = OpKernelType(proto::VarType::FP32, place,
DataLayout::kNCHW, LibraryType::kPlain);
// Forward direction: NHWC kernel type -> NCHW kernel type, result in `out`.
TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
EXPECT_TRUE(out.layout() == DataLayout::kNCHW);
EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1}));
auto place = paddle::platform::CPUPlace();
paddle::framework::Tensor in = paddle::framework::Tensor();
paddle::framework::Tensor out = paddle::framework::Tensor();
in.mutable_data<double>(paddle::framework::make_ddim({2, 3, 1, 2}), place);
in.set_layout(paddle::framework::DataLayout::kNHWC);
auto kernel_nhwc = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP32, place,
paddle::framework::DataLayout::kNHWC,
paddle::framework::LibraryType::kPlain);
auto kernel_ncwh = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP32, place,
paddle::framework::DataLayout::kNCHW,
paddle::framework::LibraryType::kPlain);
paddle::framework::TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
EXPECT_TRUE(out.layout() == paddle::framework::DataLayout::kNCHW);
EXPECT_TRUE(out.dims() == paddle::framework::make_ddim({2, 2, 3, 1}));
// Reverse direction: NCHW kernel type -> NHWC kernel type, again from `in`
// into `out`.
TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out);
// NOTE(review): the assertions below inspect `in`, not `out`, so they only
// show that the input kept its layout and dims — confirm whether the result
// of the reverse call was meant to be checked as well.
EXPECT_TRUE(in.layout() == DataLayout::kNHWC);
EXPECT_TRUE(in.dims() == make_ddim({2, 3, 1, 2}));
EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC);
EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2}));
}

@ -17,43 +17,58 @@ limitations under the License. */
#include "gtest/gtest.h"
TEST(DataTypeTransform, CPUTransform) {
using namespace paddle::framework;
using namespace paddle::platform;
auto place = CPUPlace();
auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int64 = OpKernelType(proto::VarType::INT64, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_bool = OpKernelType(proto::VarType::BOOL, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto place = paddle::platform::CPUPlace();
auto kernel_fp16 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP16, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_fp32 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP32, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_fp64 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP64, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_int32 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::INT32, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_int64 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::INT64, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_bool = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::BOOL, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
// data type transform from float32
{
Tensor in;
Tensor out;
paddle::framework::Tensor in;
paddle::framework::Tensor out;
float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
float* ptr =
in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
int data_number = 2 * 3;
for (int i = 0; i < data_number; ++i) {
ptr[i] = i / 3;
}
TransDataType(kernel_fp32, kernel_fp64, in, &out);
paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in, &out);
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(i / 3));
}
TransDataType(kernel_fp32, kernel_int32, in, &out);
paddle::framework::TransDataType(kernel_fp32, kernel_int32, in, &out);
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(i / 3));
@ -62,10 +77,11 @@ TEST(DataTypeTransform, CPUTransform) {
// data type transform from/to float16
{
Tensor in;
Tensor out;
paddle::framework::Tensor in;
paddle::framework::Tensor out;
float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), place);
paddle::platform::float16* ptr = in.mutable_data<paddle::platform::float16>(
paddle::framework::make_ddim({2, 3}), place);
int data_number = 2 * 3;
for (int i = 0; i < data_number; ++i) {
@ -73,94 +89,104 @@ TEST(DataTypeTransform, CPUTransform) {
}
// transform from float16 to other data types
TransDataType(kernel_fp16, kernel_fp32, in, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in, &out);
float* out_data_float = out.data<float>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_fp64, in, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in, &out);
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int32, in, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_int32, in, &out);
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int64, in, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_int64, in, &out);
int64_t* out_data_int64 = out.data<int64_t>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_bool, in, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_bool, in, &out);
bool* out_data_bool = out.data<bool>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
}
// transform float to float16
float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), place);
float* in_data_float =
in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_float[i] = i;
}
TransDataType(kernel_fp32, kernel_fp16, in, &out);
ptr = out.data<float16>();
paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in, &out);
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_float[i]).x);
}
// transform double to float16
double* in_data_double = in.mutable_data<double>(make_ddim({2, 3}), place);
double* in_data_double =
in.mutable_data<double>(paddle::framework::make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_double[i] = i;
}
TransDataType(kernel_fp64, kernel_fp16, in, &out);
ptr = out.data<float16>();
paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in, &out);
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_double[i]).x);
}
// transform int to float16
int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), place);
int* in_data_int =
in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_int[i] = i;
}
TransDataType(kernel_int32, kernel_fp16, in, &out);
ptr = out.data<float16>();
paddle::framework::TransDataType(kernel_int32, kernel_fp16, in, &out);
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_int[i]).x);
}
// transform int64 to float16
int64_t* in_data_int64 = in.mutable_data<int64_t>(make_ddim({2, 3}), place);
int64_t* in_data_int64 =
in.mutable_data<int64_t>(paddle::framework::make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_int64[i] = i;
}
TransDataType(kernel_int64, kernel_fp16, in, &out);
ptr = out.data<float16>();
paddle::framework::TransDataType(kernel_int64, kernel_fp16, in, &out);
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_int64[i]).x);
}
// transform bool to float16
bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), place);
bool* in_data_bool =
in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_bool[i] = i;
}
TransDataType(kernel_bool, kernel_fp16, in, &out);
ptr = out.data<float16>();
paddle::framework::TransDataType(kernel_bool, kernel_fp16, in, &out);
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_bool[i]).x);
}
}
}

File diff suppressed because it is too large Load Diff

@ -8,27 +8,28 @@ cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope plac
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
if(WITH_GPU)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda)
set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim dynload_cuda)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
else()
set(multi_devices_graph_builder_deps)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
endif()
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base variable_visitor scope ddim memory)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope variable_visitor ddim memory)
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory

@ -44,9 +44,15 @@ void BroadcastOpHandle::RunImpl() {
// &in_place;
WaitInputVarGenerated(*in_var_handle);
auto *in_var = local_scopes_.at(in_var_handle->scope_idx_)
->FindVar(in_var_handle->name_);
std::vector<const Scope *> var_scopes;
for (auto *s : local_scopes_) {
var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
}
auto *in_var =
var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
PADDLE_ENFORCE_NOT_NULL(in_var);
Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
for (auto *out : out_var_handles) {
@ -55,17 +61,16 @@ void BroadcastOpHandle::RunImpl() {
}
auto &out_p = out->place_;
auto *out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_);
auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_);
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_EQ(out_p.which(), in_var_handle->place_.which(),
"Places must be all on CPU or all on CUDA.");
VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
VariableVisitor::GetMutableTensor(out_var)
.Resize(in_tensor.dims())
.mutable_data(out_p, in_tensor.type());
VariableVisitor::GetMutableTensor(out_var).mutable_data(out_p,
in_tensor.type());
auto dev_ctx = dev_ctxes_[out_p];
auto dev_ctx = dev_ctxes_.at(out_p);
RunAndRecordEvent(out_p, [in_tensor, out_var, dev_ctx, out_p] {
paddle::framework::TensorCopy(
in_tensor, out_p, *(dev_ctx),

@ -30,6 +30,7 @@ const f::DDim kDims = {20, 20};
struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
std::vector<Scope*> param_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
@ -72,11 +73,17 @@ struct TestBroadcastOpHandle {
void InitBroadcastOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
Scope& local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope;
local_scope.Var("out");
param_scopes_.emplace_back(&local_scope);
}
local_scopes_[input_scope_idx]->Var("input");
param_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
auto* in_var_handle =
new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
vars_.emplace_back(in_var_handle);
@ -105,7 +112,8 @@ struct TestBroadcastOpHandle {
}
void TestBroadcastLodTensor(size_t input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@ -117,6 +125,7 @@ struct TestBroadcastOpHandle {
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
in_lod_tensor->set_lod(lod);
in_lod_tensor->Resize(kDims);
op_handle_->Run(false);
@ -124,7 +133,8 @@ struct TestBroadcastOpHandle {
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scopes_[j]->Var("out");
auto out_var = param_scopes_[j]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto out_tensor = out_var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
@ -139,7 +149,8 @@ struct TestBroadcastOpHandle {
}
void TestBroadcastSelectedRows(size_t input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@ -162,7 +173,8 @@ struct TestBroadcastOpHandle {
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scopes_[j]->Var("out");
auto out_var = param_scopes_[j]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();

@ -14,7 +14,7 @@
#pragma once
#include <memory>
#include <thread>
#include <thread> // NOLINT
namespace paddle {
namespace framework {
@ -23,7 +23,7 @@ namespace details {
// Change it to thread safe flags if needed.
class ThreadUnsafeOwnershipFlags {
public:
ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags& operator=(

@ -41,14 +41,19 @@ void GatherOpHandle::RunImpl() {
out_var_handle = out_var_handles.front();
}
std::vector<const Scope *> var_scopes;
for (auto *s : local_scopes_) {
var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
}
auto in_0_handle = in_var_handles[0];
auto pre_in_var =
local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
auto pre_place = in_0_handle->place_;
var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
"Currently, gather_op only can gather SelectedRows.");
auto pre_place = in_0_handle->place_;
PADDLE_ENFORCE_EQ(out_var_handle->place_.which(), pre_place.which(),
"The place of input and output should be the same.");
@ -67,7 +72,7 @@ void GatherOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
"Places must be all on CPU or all on CUDA.");
auto *in_var =
local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
auto &in_sr = in_var->Get<framework::SelectedRows>();
PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
@ -86,7 +91,7 @@ void GatherOpHandle::RunImpl() {
// write the output
auto &out_place = out_var_handle->place_;
auto out_scope_idx = out_var_handle->scope_idx_;
auto out_var = local_scopes_[out_scope_idx]->FindVar(out_var_handle->name_);
auto out_var = var_scopes.at(out_scope_idx)->FindVar(out_var_handle->name_);
auto out = out_var->GetMutable<framework::SelectedRows>();
out->set_height(pre_in.height());

@ -29,6 +29,7 @@ const f::DDim kDims = {20, 20};
struct TestGatherOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
std::vector<Scope*> param_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
@ -71,9 +72,14 @@ struct TestGatherOpHandle {
void InitGatherOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
Scope& local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope;
local_scope.Var("input");
param_scopes_.emplace_back(&local_scope);
}
local_scopes_[input_scope_idx]->Var("input");
param_scopes_[input_scope_idx]->Var("out");
op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
// add input
@ -115,7 +121,8 @@ struct TestGatherOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@ -128,10 +135,11 @@ struct TestGatherOpHandle {
value->Resize(kDims);
}
auto out_var = local_scopes_[output_scope_idx]->Var("out");
auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = local_scopes_[output_scope_idx]->Var("input");
auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
out_selected_rows->mutable_value()->ShareDataWith(
@ -155,7 +163,8 @@ struct TestGatherOpHandle {
f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t j = 0; j < f::product(kDims); ++j) {
for (int64_t j = 0;
j < f::product(kDims) * static_cast<int64_t>(gpu_list_.size()); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
}
}

@ -43,21 +43,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
int dtype = -1;
size_t numel = 0;
std::vector<LoDTensor> lod_tensors;
std::vector<const LoDTensor *> lod_tensors;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i];
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
lod_tensors.emplace_back(lod_tensor);
lod_tensors.emplace_back(&lod_tensor);
}
if (platform::is_gpu_place(lod_tensors[0].place())) {
if (platform::is_gpu_place(lod_tensors[0]->place())) {
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
auto &lod_tensor = lod_tensors[i];
auto &lod_tensor = *lod_tensors[i];
void *buffer = const_cast<void *>(lod_tensor.data<void>());
if (dtype == -1) {
@ -93,7 +93,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func);
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &scope =

@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#include <string>
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_proto_maker.h"

@ -24,23 +24,23 @@ namespace framework {
namespace details {
struct ReduceLoDTensor {
const std::vector<LoDTensor> &src_tensors_;
const std::vector<const LoDTensor *> &src_tensors_;
LoDTensor &dst_tensor_;
ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
ReduceLoDTensor(const std::vector<const LoDTensor *> &src, LoDTensor *dst)
: src_tensors_(src), dst_tensor_(*dst) {}
template <typename T>
void operator()() const {
PADDLE_ENFORCE(!src_tensors_.empty());
auto &t0 = src_tensors_[0];
auto &t0 = *src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0);
dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
for (size_t i = 1; i < src_tensors_.size(); ++i) {
auto &t = src_tensors_[i];
auto &t = *src_tensors_[i];
PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
PADDLE_ENFORCE_EQ(t.type(), t0.type());
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,

@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
namespace paddle {
namespace framework {
@ -21,85 +23,84 @@ namespace details {
void ReduceOpHandle::RunImpl() {
// the input and output may have dummy var.
std::vector<VarHandle *> in_var_handles = GetValidVarHandles(inputs_);
std::vector<VarHandle *> out_var_handles = GetValidVarHandles(outputs_);
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
// Wait input done, this Wait is asynchronous operation
WaitEvents(in_var_handles);
VarHandle *out_var_handle;
{
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
out_var_handle = out_var_handles.front();
}
// check in the same place
auto in_0_handle = in_var_handles[0];
auto pre_place = in_0_handle->place_;
std::vector<const Scope *> var_scopes;
for (auto *s : local_scopes_) {
var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
}
auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
// Wait input done, this Wait is asynchronous operation
WaitInputVarGenerated(in_var_handles);
auto pre_place = in_0_handle->place_;
std::vector<platform::Place> in_places;
auto pre_in_tensor = VariableVisitor::GetMutableTensor(pre_in_var);
for (auto *in_handle : in_var_handles) {
auto in_p = in_handle->place_;
PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
"Places must be all on CPU or all on CUDA.");
in_places.emplace_back(in_p);
}
auto out_var = local_scopes_[out_var_handles[0]->scope_idx_]->FindVar(
out_var_handles[0]->name_);
auto in_var =
var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
PADDLE_ENFORCE_NOT_NULL(in_var);
auto pre_in_var =
local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
if (pre_in_var->IsType<framework::SelectedRows>()) {
auto &pre_in = pre_in_var->Get<framework::SelectedRows>();
std::vector<const SelectedRows *> in_selected_rows;
auto in_tensor = VariableVisitor::GetMutableTensor(in_var);
PADDLE_ENFORCE_EQ(in_tensor.type(), pre_in_tensor.type(),
"The type of input is not consistent.");
}
for (auto *in_handle : in_var_handles) {
auto in_var =
local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
auto &in_sr = in_var->Get<framework::SelectedRows>();
auto out_var =
var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_);
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
"The type of input is not consistent.");
if (pre_in_var->IsType<framework::SelectedRows>()) {
std::vector<const SelectedRows *> in_selected_rows =
GetInputValues<SelectedRows>(in_var_handles, var_scopes);
in_selected_rows.emplace_back(&in_sr);
}
auto trg = out_var->GetMutable<framework::SelectedRows>();
GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_,
out_var_handles[0]->place_, trg);
out_var_handle->place_,
out_var->GetMutable<framework::SelectedRows>());
} else {
auto pre_in = pre_in_var->Get<framework::LoDTensor>();
std::vector<LoDTensor> lod_tensors;
// can be refined
for (auto *in_handle : in_var_handles) {
auto in_var =
local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
auto &in_sr = in_var->Get<framework::LoDTensor>();
PADDLE_ENFORCE_EQ(in_sr.type(), pre_in.type(),
"The type of input is not consistent.");
lod_tensors.emplace_back(in_sr);
}
auto trg = out_var->GetMutable<framework::LoDTensor>();
trg->Resize(pre_in.dims());
trg->mutable_data(out_var_handles[0]->place_, pre_in.type());
std::vector<const LoDTensor *> lod_tensors =
GetInputValues<LoDTensor>(in_var_handles, var_scopes);
if (paddle::platform::is_cpu_place(pre_place)) {
ReduceLoDTensor func(lod_tensors, trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func);
ReduceLoDTensor func(lod_tensors,
out_var->GetMutable<framework::LoDTensor>());
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
} else if (paddle::platform::is_gpu_place(pre_place)) {
#ifdef PADDLE_WITH_CUDA
auto out_p = out_var_handles[0]->place_;
int root = boost::get<platform::CUDAPlace>(out_p).device;
auto pre_in = pre_in_var->Get<framework::LoDTensor>();
VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
VariableVisitor::GetMutableTensor(out_var).mutable_data(
out_var_handle->place_, pre_in.type());
auto out_p = out_var_handle->place_;
int root = boost::get<platform::CUDAPlace>(out_p).device;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
for (size_t i = 0; i < var_scopes.size(); ++i) {
auto &p = in_places[i];
auto &lod_tensor = lod_tensors[i];
auto &lod_tensor = *lod_tensors[i];
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
@ -109,14 +110,16 @@ void ReduceOpHandle::RunImpl() {
void *buffer = const_cast<void *>(lod_tensor.data<void>());
void *recvbuffer = nullptr;
if (root == dev_id) {
recvbuffer = trg->mutable_data(out_var_handles[0]->place_);
recvbuffer =
out_var->GetMutable<framework::LoDTensor>()->mutable_data(
out_var_handle->place_);
}
int type = platform::ToNCCLDataType(lod_tensor.type());
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclReduce(
buffer, recvbuffer, static_cast<size_t>(lod_tensor.numel()),
platform::ToNCCLDataType(lod_tensor.type()), ncclSum, root, comm,
stream));
static_cast<ncclDataType_t>(type), ncclSum, root, comm, stream));
});
}
@ -135,26 +138,31 @@ void ReduceOpHandle::RunImpl() {
}
}
void ReduceOpHandle::WaitEvents(
const std::vector<VarHandle *> &in_var_handles) {
if (in_var_handles[0]->generated_op_) {
for (auto *in : in_var_handles) {
in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in->place_]);
}
// Collects read-only pointers to the T stored in each input variable.
//
// For every handle in `in_var_handles`, the variable named `name_` is looked
// up in `var_scopes[scope_idx_]` and a pointer to its T value is appended, so
// the returned vector has one entry per handle, in the same order.
//
// `var_scopes.at()` throws on an out-of-range scope index, and a variable that
// cannot be found triggers PADDLE_ENFORCE_NOT_NULL instead of being
// dereferenced. The returned pointers refer to data held by the variables; the
// function does not copy the values.
template <typename T>
std::vector<const T *> ReduceOpHandle::GetInputValues(
    const std::vector<VarHandle *> &in_var_handles,
    const std::vector<const Scope *> &var_scopes) const {
  std::vector<const T *> in_values;
  in_values.reserve(in_var_handles.size());
  for (auto *in_handle : in_var_handles) {
    auto *in_var =
        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
    // FindVar's result was previously dereferenced unchecked; fail loudly if
    // the input variable does not exist in its scope.
    PADDLE_ENFORCE_NOT_NULL(in_var);
    auto &in_value = in_var->Get<T>();
    in_values.emplace_back(&in_value);
  }
  return in_values;
}
std::vector<VarHandle *> ReduceOpHandle::GetValidVarHandles(
const std::vector<VarHandleBase *> &inputs) {
std::vector<VarHandle *> in_var_handles;
for (auto *in : inputs) {
auto *in_handle = dynamic_cast<VarHandle *>(in);
if (in_handle) {
in_var_handles.push_back(in_handle);
void ReduceOpHandle::WaitInputVarGenerated(
const std::vector<VarHandle *> &in_var_handles) {
for (auto *in : in_var_handles) {
if (in->generated_op_) {
for (auto pair : dev_ctxes_) {
in->generated_op_->Wait(pair.second);
}
}
}
return in_var_handles;
}
// Returns the fixed name of this op handle: "reduce".
std::string ReduceOpHandle::Name() const {
  return std::string("reduce");
}
} // namespace details
} // namespace framework

@ -59,10 +59,13 @@ struct ReduceOpHandle : public OpHandleBase {
protected:
void RunImpl() override;
std::vector<VarHandle *> GetValidVarHandles(
const std::vector<VarHandleBase *> &inputs);
void WaitEvents(const std::vector<VarHandle *> &in_var_handles);
void WaitInputVarGenerated(const std::vector<VarHandle *> &in_var_handles);
template <typename T>
std::vector<const T *> GetInputValues(
const std::vector<VarHandle *> &in_var_handles,
const std::vector<const Scope *> &var_scopes) const;
};
} // namespace details

@ -14,7 +14,6 @@
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
@ -30,6 +29,7 @@ struct TestReduceOpHandle {
bool use_gpu_;
Scope g_scope_;
std::vector<Scope *> local_scopes_;
std::vector<Scope *> param_scopes_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> gpu_list_;
@ -83,12 +83,18 @@ struct TestReduceOpHandle {
}
}
void InitReduceOp(size_t input_scope_idx) {
void InitReduceOp(size_t out_scope_idx) {
// init scope
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
Scope &local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope *>() = &local_scope;
local_scope.Var("input");
param_scopes_.emplace_back(&local_scope);
}
local_scopes_[input_scope_idx]->Var("input");
param_scopes_[out_scope_idx]->Var("out");
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
@ -106,6 +112,7 @@ struct TestReduceOpHandle {
#endif
}
// init op handle
// add input
for (size_t j = 0; j < gpu_list_.size(); ++j) {
if (!use_gpu_) {
@ -126,7 +133,7 @@ struct TestReduceOpHandle {
// add output
auto *out_var_handle =
new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
new VarHandle(2, out_scope_idx, "out", gpu_list_[out_scope_idx]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
@ -148,7 +155,8 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@ -161,10 +169,11 @@ struct TestReduceOpHandle {
value->Resize(kDims);
}
auto out_var = local_scopes_[output_scope_idx]->Var("out");
auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = local_scopes_[output_scope_idx]->Var("input");
auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
out_selected_rows->mutable_value()->ShareDataWith(
@ -202,7 +211,8 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_lod_tensor->set_lod(lod);
@ -211,10 +221,11 @@ struct TestReduceOpHandle {
send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
}
auto out_var = local_scopes_[output_scope_idx]->Var("out");
auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
auto in_var = local_scopes_[output_scope_idx]->Var("input");
auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
auto in_lodtensor = in_var->Get<f::LoDTensor>();
out_lodtensor->ShareDataWith(in_lodtensor);
@ -239,34 +250,34 @@ struct TestReduceOpHandle {
TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
TestReduceOpHandle test_op;
size_t input_scope_idx = 0;
size_t out_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitReduceOp(input_scope_idx);
test_op.TestReduceSelectedRows(input_scope_idx);
test_op.InitReduceOp(out_scope_idx);
test_op.TestReduceSelectedRows(out_scope_idx);
}
TEST(ReduceTester, TestCPUReduceTestLodTensor) {
TestReduceOpHandle test_op;
size_t input_scope_idx = 0;
size_t out_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitReduceOp(input_scope_idx);
test_op.TestReduceLodTensors(input_scope_idx);
test_op.InitReduceOp(out_scope_idx);
test_op.TestReduceLodTensors(out_scope_idx);
}
#ifdef PADDLE_WITH_CUDA
TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
TestReduceOpHandle test_op;
size_t input_scope_idx = 0;
size_t out_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitReduceOp(input_scope_idx);
test_op.TestReduceSelectedRows(input_scope_idx);
test_op.InitReduceOp(out_scope_idx);
test_op.TestReduceSelectedRows(out_scope_idx);
}
TEST(ReduceTester, TestGPUReduceTestLodTensor) {
TestReduceOpHandle test_op;
size_t input_scope_idx = 0;
size_t out_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitReduceOp(input_scope_idx);
test_op.TestReduceLodTensors(input_scope_idx);
test_op.InitReduceOp(out_scope_idx);
test_op.TestReduceLodTensors(out_scope_idx);
}
#endif

@ -202,8 +202,9 @@ class CosineOpComplete : public paddle::framework::CosineOp {
};
TEST(OperatorRegistrar, Test) {
using namespace paddle::framework;
OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
paddle::framework::OperatorRegistrar<
CosineOpComplete, paddle::framework::CosineOpProtoAndCheckerMaker>
reg("cos");
}
namespace paddle {

@ -226,10 +226,8 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
// test with multi inputs
TEST(OpKernel, multi_inputs) {
using namespace paddle::framework;
paddle::framework::InitDevices(true);
proto::OpDesc op_desc;
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_multi_inputs_with_kernel");
BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
@ -243,12 +241,12 @@ TEST(OpKernel, multi_inputs) {
paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope;
scope.Var("x0")->GetMutable<LoDTensor>();
scope.Var("x1")->GetMutable<LoDTensor>();
scope.Var("x2")->GetMutable<LoDTensor>();
scope.Var("k0")->GetMutable<LoDTensor>();
scope.Var("y0")->GetMutable<LoDTensor>();
scope.Var("y1")->GetMutable<LoDTensor>();
scope.Var("x0")->GetMutable<paddle::framework::LoDTensor>();
scope.Var("x1")->GetMutable<paddle::framework::LoDTensor>();
scope.Var("x2")->GetMutable<paddle::framework::LoDTensor>();
scope.Var("k0")->GetMutable<paddle::framework::LoDTensor>();
scope.Var("y0")->GetMutable<paddle::framework::LoDTensor>();
scope.Var("y1")->GetMutable<paddle::framework::LoDTensor>();
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
op->Run(scope, cpu_place);

@ -27,10 +27,14 @@ BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
return blocks_.back().get();
}
proto::ProgramDesc *ProgramDesc::Proto() {
void ProgramDesc::Flush() {
for (auto &block : blocks_) {
block->Flush();
}
}
// Flushes all blocks, then returns a pointer to the member desc_.
// The pointer refers to this object's own member; no copy is made.
proto::ProgramDesc *ProgramDesc::Proto() {
  this->Flush();
  proto::ProgramDesc *flushed_desc = &desc_;
  return flushed_desc;
}

@ -51,6 +51,8 @@ class ProgramDesc {
size_t Size() const { return blocks_.size(); }
void Flush();
proto::ProgramDesc *Proto();
// The output variable of feed_op is referenced as feed_target.

@ -15,14 +15,14 @@ limitations under the License. */
#include <gtest/gtest.h>
#include <atomic>
#include "threadpool.h"
#include "paddle/fluid/framework/threadpool.h"
namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
void do_sum(framework::ThreadPool* pool, std::atomic<int>* sum, int cnt) {
std::vector<std::future<void>> fs;
for (int i = 0; i < cnt; ++i) {
fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); }));
fs.push_back(framework::Async([sum]() { sum->fetch_add(1); }));
}
}
@ -46,7 +46,7 @@ TEST(ThreadPool, ConcurrentRun) {
int n = 50;
// sum = (n * (n + 1)) / 2
for (int i = 1; i <= n; ++i) {
std::thread t(do_sum, pool, std::ref(sum), i);
std::thread t(do_sum, pool, &sum, i);
threads.push_back(std::move(t));
}
for (auto& t : threads) {

@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h"
#include <algorithm>
#include <fstream>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
@ -27,14 +28,14 @@ namespace inference {
// linking the inference shared library.
void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
void ReadBinaryFile(const std::string& filename, std::string& contents) {
void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
fin.seekg(0, std::ios::end);
contents.clear();
contents.resize(fin.tellg());
contents->clear();
contents->resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&contents[0], contents.size());
fin.read(&(contents->at(0)), contents->size());
fin.close();
}
@ -47,7 +48,7 @@ bool IsPersistable(const framework::VarDesc* var) {
return false;
}
void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename) {
@ -92,18 +93,18 @@ void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
op->CheckAttrs();
}
executor.Run(*load_program, &scope, 0, true, true);
executor->Run(*load_program, scope, 0, true, true);
delete load_program;
}
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
framework::Scope* scope,
const std::string& dirname) {
std::string model_filename = dirname + "/__model__";
std::string program_desc_str;
VLOG(3) << "loading model from " << model_filename;
ReadBinaryFile(model_filename, program_desc_str);
ReadBinaryFile(model_filename, &program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
@ -113,11 +114,11 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
}
std::unique_ptr<framework::ProgramDesc> Load(
framework::Executor& executor, framework::Scope& scope,
framework::Executor* executor, framework::Scope* scope,
const std::string& prog_filename, const std::string& param_filename) {
std::string model_filename = prog_filename;
std::string program_desc_str;
ReadBinaryFile(model_filename, program_desc_str);
ReadBinaryFile(model_filename, &program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));

@ -27,17 +27,17 @@ namespace inference {
void Init(bool init_p2p);
void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
framework::Scope* scope,
const std::string& dirname);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
framework::Scope* scope,
const std::string& prog_filename,
const std::string& param_filename);

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save