Support LoDTensorArray in fetch (#23645)

* Support LoDTensorArray in fetch op

test=develop
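
As a quick orientation before the hunks below: the fetch result element type changes from a plain LoDTensor to a variant (FetchType) that can also hold a LoDTensorArray, and the executors, fetch op handle, feed/fetch helpers, and inference tests are updated accordingly. A minimal sketch, in the style of the updated inference tests, of how a fetch target is now declared and unpacked; the run step is elided and the function name is hypothetical:

// Sketch only; compiles against the Paddle framework headers touched in this diff.
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_tensor.h"

void FetchTargetSketch() {
  // A fetch target is now a FetchType, not a bare LoDTensor.
  paddle::framework::FetchType output;
  std::vector<paddle::framework::FetchType*> cpu_fetchs;
  cpu_fetchs.push_back(&output);

  // ... run inference here, filling `output` through the fetch targets ...

  // Unpack the variant explicitly once the run has finished.
  auto& tensor = boost::get<paddle::framework::LoDTensor>(output);
  LOG(INFO) << tensor.dims();
}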

@ -197,13 +197,27 @@ FetchResultType AsyncSSAGraphExecutor::Run(
HandleException();
FeedFetchList ret;
auto &val = boost::get<FeedFetchList>(fetch_data);
FetchList ret;
auto &val = boost::get<FetchList>(fetch_data);
for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.push_back(&val.at(fetch_idx));
ret.emplace_back();
ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
if (data_is_lod_tensor(val.at(fetch_idx))) {
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.push_back(&(boost::get<LoDTensor>(val.at(fetch_idx))));
LoDTensor var;
var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
ret.emplace_back(var);
} else {
auto array = boost::get<LoDTensorArray>(val.at(fetch_idx));
LoDTensorArray item_array;
item_array.reserve(array.size());
for (size_t i = 0; i < array.size(); ++i) {
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.push_back(&array[i]);
item_array.emplace_back();
item_array.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
}
ret.emplace_back(item_array);
}
}
return ret;
}

@ -63,7 +63,7 @@ FetchResultType FastThreadedSSAGraphExecutor::Run(
FetchResultType fetches;
if (return_merged) {
fetches = FeedFetchList(fetch_tensors.size());
fetches = FetchList(fetch_tensors.size());
} else {
fetches = FetchUnmergedList(fetch_tensors.size());
}

@ -39,51 +39,98 @@ void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
}
void FetchOpHandle::WaitAndMergeCPUTensors() const {
static void CheckDims(const framework::DDim &tensor_dims,
const framework::DDim &ele_dims, const size_t offset) {
PADDLE_ENFORCE_EQ(
tensor_dims.size(), ele_dims.size(),
platform::errors::Fatal("The dimension sizes of fetched Tensors or "
"the items of fetched LoDTensorArray are "
"different from each other on different "
"devices. And the error is caused by the %zu "
"(th) fetched variable. Please set the "
"parameter `return_merged = False` when you "
"call the `Executor.run()` method.",
offset));
for (int j = 1; j < tensor_dims.size(); j++) {
PADDLE_ENFORCE_EQ(
tensor_dims[j], ele_dims[j],
platform::errors::Fatal("The dimensions of fetched Tensors or "
"the items of fetched LoDTensorArray are "
"different from each other on different "
"devices. And the error is caused by the "
"%zu (th) fetched variable. Please set the "
"parameter `return_merged = False` when "
"you call the `Executor.run()` method.",
offset));
}
}
void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
if (return_merged_) {
const auto &tensor_dims = tensors_[0].dims();
for (size_t i = 1; i < tensors_.size(); i++) {
const auto &ele_dims = tensors_[i].dims();
PADDLE_ENFORCE_EQ(
tensor_dims.size(), ele_dims.size(),
platform::errors::Fatal("The dimension sizes of fetched Tensors are "
"different from each other on different "
"devices. And the error is caused by the %zu "
"(th) fetched variable. Please set the "
"parameter `return_merged = False` when you "
"call the `Executor.run()` method.",
offset_));
for (int j = 1; j < tensor_dims.size(); j++) {
PADDLE_ENFORCE_EQ(
tensor_dims[j], ele_dims[j],
platform::errors::Fatal("The dimensions of fetched Tensors are "
"different from each other on different "
"devices. And the error is caused by the "
"%zu (th) fetched variable. Please set the "
"parameter `return_merged = False` when "
"you call the `Executor.run()` method.",
offset_));
if (data_is_lod_tensor(tensors_[0])) {
const auto &tensor_dims = boost::get<LoDTensor>(tensors_[0]).dims();
for (size_t i = 1; i < tensors_.size(); i++) {
const auto &ele_dims = boost::get<LoDTensor>(tensors_[i]).dims();
CheckDims(tensor_dims, ele_dims, offset_);
}
std::vector<const LoDTensor *> tensors_ptr;
tensors_ptr.reserve(tensors_.size());
for (auto &t : tensors_) {
tensors_ptr.emplace_back(&boost::get<LoDTensor>(t));
}
auto &val = boost::get<FetchList>(*data_);
LoDTensor var;
var.MergeLoDTensor(tensors_ptr, platform::CPUPlace());
val.at(offset_) = std::move(var);
} else {
auto &array = boost::get<LoDTensorArray>(tensors_[0]);
LoDTensorArray tmp_array;
tmp_array.reserve(array.size());
for (size_t i = 0; i < array.size(); ++i) {
const auto &tensor_dims = array[i].dims();
std::vector<const LoDTensor *> tensors_ptr;
tensors_ptr.reserve(tensors_.size());
tensors_ptr.push_back(&array[i]);
for (size_t j = 1; j < tensors_.size(); ++j) {
auto &element = boost::get<LoDTensorArray>(tensors_[j]);
const auto &ele_dims = element[i].dims();
CheckDims(tensor_dims, ele_dims, offset_);
tensors_ptr.push_back(&element[i]);
}
tmp_array.emplace_back();
tmp_array.back().MergeLoDTensor(tensors_ptr, platform::CPUPlace());
}
auto &val = boost::get<FetchList>(*data_);
val.at(offset_) = std::move(tmp_array);
}
std::vector<const LoDTensor *> tensors_ptr;
tensors_ptr.reserve(tensors_.size());
for (auto &t : tensors_) {
tensors_ptr.emplace_back(&t);
}
auto &val = boost::get<FeedFetchList>(*data_);
val.at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
} else {
auto &val = boost::get<FetchUnmergedList>(*data_);
val.at(offset_) = std::move(tensors_);
}
}
static void TransData(const framework::LoDTensor &src_item,
framework::LoDTensor *dst_item) {
if (src_item.IsInitialized() && src_item.numel() > 0) {
if (platform::is_gpu_place(src_item.place())) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(src_item, platform::CPUPlace(), dst_item);
#endif
} else {
dst_item->ShareDataWith(src_item);
}
} else {
dst_item->clear();
dst_item->Resize({0});
}
dst_item->set_lod(src_item.lod());
}
void FetchOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated(platform::CPUPlace());
tensors_.resize(inputs_.size());
platform::CPUPlace cpu;
auto &scopes = *local_exec_scopes_;
for (size_t i = 0; i < inputs_.size(); ++i) {
@ -93,23 +140,21 @@ void FetchOpHandle::RunImpl() {
PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
var_handle->name());
auto &t = var->Get<framework::LoDTensor>();
if (t.IsInitialized() && t.numel() > 0) {
if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, &tensors_[i]);
#endif
} else {
tensors_[i].ShareDataWith(t);
}
if (var->IsType<LoDTensor>()) {
auto &t = var->Get<framework::LoDTensor>();
auto &item = boost::get<LoDTensor>(tensors_[i]);
TransData(t, &item);
} else {
tensors_[i].clear();
tensors_[i].Resize({0});
auto &t = var->Get<framework::LoDTensorArray>();
LoDTensorArray tmp(t.size());
tensors_[i] = tmp;
auto &item = boost::get<LoDTensorArray>(tensors_[i]);
for (size_t j = 0; j < t.size(); ++j) {
TransData(t[j], &item[j]);
}
}
tensors_[i].set_lod(t.lod());
}
this->WaitAndMergeCPUTensors();
this->WaitAndMergeCPUFetchVars();
}
void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {

@ -36,7 +36,7 @@ struct FetchOpHandle : public OpHandleBase {
void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
void WaitAndMergeCPUTensors() const;
void WaitAndMergeCPUFetchVars() const;
std::string Name() const override;
@ -54,7 +54,7 @@ struct FetchOpHandle : public OpHandleBase {
size_t offset_;
std::vector<Scope *> *local_scopes_;
std::vector<Scope *> *local_exec_scopes_;
std::vector<LoDTensor> tensors_;
std::vector<FetchType> tensors_;
bool return_merged_;
};

@ -179,7 +179,7 @@ FetchResultType ParallelSSAGraphExecutor::Run(
}
if (return_merged) {
return FeedFetchList();
return FetchList();
} else {
return FetchUnmergedList();
}
@ -245,22 +245,43 @@ FetchResultType ParallelSSAGraphExecutor::Run(
}
if (return_merged) {
FeedFetchList ret;
FetchList ret;
ret.reserve(fetch_tensors.size());
for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.reserve(place_num);
std::vector<const LoDTensorArray *> lodtensorarray_ptrs;
lodtensorarray_ptrs.reserve(place_num);
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
if (!is_valid[scope_idx]) {
continue;
}
const auto &fetch_list =
boost::get<FeedFetchList>(fetch_data[scope_idx]);
lodtensor_ptrs.push_back(&fetch_list[fetch_idx]);
const auto &fetch_list = boost::get<FetchList>(fetch_data[scope_idx]);
if (data_is_lod_tensor(fetch_list[fetch_idx])) {
lodtensor_ptrs.push_back(
&(boost::get<LoDTensor>(fetch_list[fetch_idx])));
} else {
lodtensorarray_ptrs.push_back(
&(boost::get<LoDTensorArray>(fetch_list[fetch_idx])));
}
}
if (lodtensor_ptrs.size() != 0) {
LoDTensor var;
var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
ret.emplace_back(var);
} else {
LoDTensorArray var_array(lodtensorarray_ptrs[0]->size());
for (size_t i = 0; i < lodtensorarray_ptrs[0]->size(); ++i) {
LoDTensor var;
std::vector<const LoDTensor *> ptrs;
for (size_t j = 0; j < lodtensorarray_ptrs.size(); ++j) {
ptrs.push_back(&(lodtensorarray_ptrs[j]->at(i)));
}
var.MergeLoDTensor(ptrs, platform::CPUPlace());
var_array[i] = std::move(var);
}
ret.emplace_back(var_array);
}
ret.emplace_back();
ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
}
return ret;
} else {
@ -277,8 +298,8 @@ FetchResultType ParallelSSAGraphExecutor::Run(
boost::get<FetchUnmergedList>(fetch_data[scope_idx]);
PADDLE_ENFORCE_EQ(
fetch_list[fetch_idx].size(), 1,
platform::errors::Fatal(
"Each place must have only one fetched LoDTensor!"));
platform::errors::Fatal("Each place must have only one fetched "
"LoDTensor/LoDTensorArray!"));
ret.back().emplace_back(fetch_list[fetch_idx][0]);
}
}

@ -72,7 +72,7 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
std::unordered_set<VarHandleBase *> fetch_dependencies;
FetchResultType fetch_data;
if (return_merged) {
fetch_data = FeedFetchList(fetch_tensors.size());
fetch_data = FetchList(fetch_tensors.size());
} else {
fetch_data = FetchUnmergedList(fetch_tensors.size());
}

@ -256,7 +256,7 @@ static bool has_feed_operators(
// Return true if the block has fetch operators and holder of matching info.
static bool has_fetch_operators(
const BlockDesc& block,
const std::map<std::string, LoDTensor*>& fetch_targets,
const std::map<std::string, FetchType*>& fetch_targets,
const std::string& fetch_holder_name) {
size_t fetch_count = 0;
for (auto* op : block.AllOps()) {
@ -306,7 +306,7 @@ static bool has_fetch_operators(
void Executor::Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets,
std::map<std::string, FetchType*>* fetch_targets,
bool create_local_scope, bool create_vars,
const std::string& feed_holder_name,
const std::string& fetch_holder_name) {
@ -504,7 +504,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
void Executor::RunPreparedContext(
ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets, bool create_local_scope,
std::map<std::string, FetchType*>* fetch_targets, bool create_local_scope,
bool create_vars, const std::string& feed_holder_name,
const std::string& fetch_holder_name) {
auto& global_block = ctx->prog_.Block(ctx->block_id_);

@ -87,7 +87,7 @@ class Executor {
// This API is very slow.
void Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets,
std::map<std::string, FetchType*>* fetch_targets,
bool create_local_scope = true, bool create_vars = true,
const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch");
@ -95,7 +95,7 @@ class Executor {
// This API is very slow.
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
std::map<std::string, LoDTensor*>* fetch_targets,
std::map<std::string, FetchType*>* fetch_targets,
bool create_local_scope = true,
bool create_vars = true,
const std::string& feed_holder_name = "feed",

@ -29,7 +29,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
// be created.
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
auto& feed_inputs = *(g_feed_value->GetMutable<FeedList>());
if (index >= feed_inputs.size()) {
feed_inputs.resize(index + 1);
}
@ -39,27 +39,35 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
feed_inputs[index].set_lod(input.lod());
}
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index) {
// Since we want to fetch LodTensor from a variable, the variable must
// Since we want to fetch FetchType from a variable, the variable must
// be created already.
Variable* g_fetch_value = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name);
PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
"Only %s can be invoked by GetFetchVariable",
typeid(FeedFetchList).name());
auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
PADDLE_ENFORCE_NOT_NULL(g_fetch_value,
platform::errors::NotFound(
"Variable %s is not found in scope.", var_name));
PADDLE_ENFORCE_EQ(g_fetch_value->IsType<FetchList>(), true,
platform::errors::InvalidArgument(
"Only %s can be invoked by GetFetchVariable",
typeid(FetchList).name()));
auto& fetch_outputs = *g_fetch_value->GetMutable<FetchList>();
auto& tensor = fetch_outputs[index];
VLOG(3) << "Fetch " << var_name << " with index " << index
<< " shape= " << tensor.dims();
PADDLE_ENFORCE_LT(index, fetch_outputs.size());
VLOG(3) << "Fetch " << var_name << " with index " << index;
PADDLE_ENFORCE_LT(index, fetch_outputs.size(),
platform::errors::InvalidArgument(
"index must less than fetch_outputs size."));
return tensor;
}
LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
Variable* var = scope.FindVar(var_name);
PADDLE_ENFORCE(var, "%s no in scope", var_name);
PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
var_name));
PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
platform::errors::InvalidArgument(
"Only support lod tensor in GetVariableTensor now."));
return *var->GetMutable<LoDTensor>();
}

@ -24,7 +24,7 @@ namespace framework {
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index);
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index);
LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);

@ -15,14 +15,33 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace framework {
using FeedFetchType = LoDTensor;
using FeedFetchList = std::vector<FeedFetchType>;
using FetchUnmergedList = std::vector<std::vector<FeedFetchType>>;
using FetchResultType = boost::variant<FeedFetchList, FetchUnmergedList>;
using FeedType = LoDTensor;
using FeedList = std::vector<FeedType>;
using FetchType = boost::variant<LoDTensor, LoDTensorArray>;
using FetchList = std::vector<FetchType>;
using FetchUnmergedList = std::vector<std::vector<FetchType>>;
using FetchResultType = boost::variant<FetchList, FetchUnmergedList>;
inline bool data_is_lod_tensor(const FetchType &data) {
if (data.type() == typeid(LoDTensor)) {
return true;
}
return false;
}
inline bool data_is_lod_tensor_array(const FetchType &data) {
if (data.type() == typeid(LoDTensorArray)) {
return true;
}
return false;
}
static const char kFeedOpType[] = "feed";
static const char kFetchOpType[] = "fetch";
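
For illustration, a small sketch of the new container itself: a FetchList can now mix plain LoDTensor entries with LoDTensorArray entries, and the helpers above distinguish the two. All setup values here are hypothetical:

// Sketch only: demonstrates FetchType/FetchList from this header.
#include "paddle/fluid/framework/feed_fetch_type.h"

void BuildFetchListSketch() {
  using namespace paddle::framework;  // LoDTensor, LoDTensorArray, FetchList

  LoDTensor tensor;         // e.g. the merged result of one fetch target
  LoDTensorArray array(3);  // e.g. the output of a control-flow op

  FetchList fetches;
  fetches.emplace_back(tensor);  // holds a LoDTensor
  fetches.emplace_back(array);   // holds a LoDTensorArray

  // data_is_lod_tensor(fetches[0]) == true
  // data_is_lod_tensor_array(fetches[1]) == true
}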

@ -20,7 +20,6 @@ namespace paddle {
namespace framework {
using LoDTensorArray = std::vector<LoDTensor>;
using LoDTensor2DArray = std::vector<std::vector<LoDTensor>>;
} // namespace framework
} // namespace paddle

@ -36,6 +36,7 @@ inline proto::VarType::Type ToVarType(int type) {
case proto::VarType::SELECTED_ROWS:
case proto::VarType::LOD_RANK_TABLE:
case proto::VarType::LOD_TENSOR_ARRAY:
case proto::VarType::FETCH_LIST:
case proto::VarType::READER:
return static_cast<proto::VarType::Type>(type);
default:
@ -61,6 +62,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
case proto::VarType::READER:
visitor(var.Get<ReaderHolder>());
return;
case proto::VarType::FETCH_LIST:
visitor(var.Get<FetchList>());
return;
default:
PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
}

@ -19,6 +19,7 @@
#include <tuple>
#include <typeindex>
#include <vector>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/platform/place.h"
@ -139,7 +140,7 @@ struct VarTypeRegistryImpl {
using VarTypeRegistry = detail::VarTypeRegistryImpl<
Tensor, LoDTensor, SelectedRows, std::vector<Scope *>, LoDRankTable,
LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
operators::reader::LoDTensorBlockingQueueHolder,
operators::reader::LoDTensorBlockingQueueHolder, FetchList,
operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_NCCL)
@ -178,6 +179,7 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST);
REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER);
REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST);
REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32);
REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32);

@ -34,9 +34,9 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
var->GetMutable<FeedList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
var->GetMutable<FetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope *>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {

@ -383,8 +383,9 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
for (size_t i = 0; i < fetches_.size(); ++i) {
int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
PADDLE_ENFORCE((size_t)idx == i);
framework::LoDTensor &fetch =
framework::FetchType &fetch_var =
framework::GetFetchVariable(*scope, "fetch", idx);
auto &fetch = boost::get<framework::LoDTensor>(fetch_var);
auto type = fetch.type();
auto output = &(outputs->at(i));
output->name = fetches_[idx]->Input("X")[0];
@ -583,9 +584,9 @@ void AnalysisPredictor::PrepareFeedFetch() {
void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
PADDLE_ENFORCE_NOT_NULL(scope);
auto *var = scope->Var("feed");
var->GetMutable<framework::FeedFetchList>();
var->GetMutable<framework::FeedList>();
var = scope->Var("fetch");
var->GetMutable<framework::FeedFetchList>();
var->GetMutable<framework::FetchList>();
}
std::vector<std::string> AnalysisPredictor::GetInputNames() {

@ -286,8 +286,9 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
for (size_t i = 0; i < fetchs_.size(); ++i) {
int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
PADDLE_ENFORCE((size_t)idx == i);
framework::LoDTensor &fetch =
framework::FetchType &fetch_var =
framework::GetFetchVariable(*scope, "fetch", idx);
auto fetch = boost::get<framework::LoDTensor>(fetch_var);
auto type = fetch.type();
auto output = &(outputs->at(i));
output->name = fetchs_[idx]->Input("X")[0];

@ -102,14 +102,15 @@ void MainWord2Vec(bool use_gpu) {
cpu_feeds.push_back(&third_word);
cpu_feeds.push_back(&fourth_word);
framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
TestInference<platform::CPUPlace>(config.model_dir, cpu_feeds, cpu_fetchs1);
float* lod_data = output1.data<float>();
for (int i = 0; i < output1.numel(); ++i) {
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
float* lod_data = output1_tensor.data<float>();
for (int i = 0; i < output1_tensor.numel(); ++i) {
EXPECT_LT(lod_data[i] - data[i], ACC_DIFF);
EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF);
}
@ -137,8 +138,8 @@ void MainImageClassification(bool use_gpu) {
std::vector<framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
framework::LoDTensor output1;
std::vector<framework::LoDTensor*> cpu_fetchs1;
framework::FetchType output1;
std::vector<framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
TestInference<platform::CPUPlace, false, true>(
@ -153,7 +154,8 @@ void MainImageClassification(bool use_gpu) {
ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length();
float* data = static_cast<float*>(outputs[0].data.data());
float* lod_data = output1.data<float>();
float* lod_data =
boost::get<paddle::framework::LoDTensor>(output1).data<float>();
for (size_t j = 0; j < len / sizeof(float); ++j) {
EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF);
}
@ -168,7 +170,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
constexpr int num_jobs = 3;
std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
std::vector<framework::LoDTensor> refs(num_jobs);
std::vector<framework::FetchType> refs(num_jobs);
for (size_t i = 0; i < jobs.size(); ++i) {
// each job has 4 words
jobs[i].resize(4);
@ -181,7 +183,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
// get reference result of each job
std::vector<paddle::framework::LoDTensor*> ref_feeds;
std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
std::vector<paddle::framework::FetchType*> ref_fetches(1, &refs[i]);
for (auto& word : jobs[i]) {
ref_feeds.push_back(&word);
}
@ -207,9 +209,10 @@ void MainThreadsWord2Vec(bool use_gpu) {
}
// check outputs correctness
float* ref_data = refs[tid].data<float>();
EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
for (int i = 0; i < refs[tid].numel(); ++i) {
auto ref_tensor = boost::get<paddle::framework::LoDTensor>(refs[tid]);
float* ref_data = ref_tensor.data<float>();
EXPECT_EQ(ref_tensor.numel(), static_cast<int64_t>(len / sizeof(float)));
for (int i = 0; i < ref_tensor.numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 2e-3);
}
});
@ -230,7 +233,7 @@ void MainThreadsImageClassification(bool use_gpu) {
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
std::vector<framework::LoDTensor> jobs(num_jobs);
std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
std::vector<framework::LoDTensor> refs(num_jobs);
std::vector<framework::FetchType> refs(num_jobs);
for (size_t i = 0; i < jobs.size(); ++i) {
// prepare inputs
std::vector<std::vector<int64_t>> feed_target_shapes =
@ -242,7 +245,7 @@ void MainThreadsImageClassification(bool use_gpu) {
// get reference result of each job
std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
std::vector<framework::FetchType*> ref_fetches(1, &refs[i]);
TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
}
@ -259,9 +262,10 @@ void MainThreadsImageClassification(bool use_gpu) {
ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length();
float* data = static_cast<float*>(local_outputs[0].data.data());
float* ref_data = refs[tid].data<float>();
EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
for (int i = 0; i < refs[tid].numel(); ++i) {
auto ref_tensor = boost::get<paddle::framework::LoDTensor>(refs[tid]);
float* ref_data = ref_tensor.data<float>();
EXPECT_EQ((size_t)ref_tensor.numel(), len / sizeof(float));
for (int i = 0; i < ref_tensor.numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
}
});

@ -40,10 +40,10 @@ TEST(inference, fit_a_line) {
cpu_feeds[i].push_back(input);
}
std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
std::vector<std::vector<paddle::framework::FetchType*>> cpu_fetchs1;
cpu_fetchs1.resize(num_threads);
for (int i = 0; i < num_threads; ++i) {
auto* output = new paddle::framework::LoDTensor();
auto* output = new paddle::framework::FetchType();
cpu_fetchs1[i].push_back(output);
}
@ -58,10 +58,10 @@ TEST(inference, fit_a_line) {
}
#ifdef PADDLE_WITH_CUDA
std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
std::vector<std::vector<paddle::framework::FetchType*>> cpu_fetchs2;
cpu_fetchs2.resize(num_threads);
for (int i = 0; i < num_threads; ++i) {
auto* output = new paddle::framework::LoDTensor();
auto* output = new paddle::framework::FetchType();
cpu_fetchs2[i].push_back(output);
}
@ -76,7 +76,9 @@ TEST(inference, fit_a_line) {
}
for (int i = 0; i < num_threads; ++i) {
CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
CheckError<float>(
boost::get<paddle::framework::LoDTensor>(*cpu_fetchs1[i][0]),
boost::get<paddle::framework::LoDTensor>(*cpu_fetchs2[i][0]));
delete cpu_fetchs2[i][0];
}
#endif

@ -50,9 +50,9 @@ TEST(inference, image_classification) {
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1;
paddle::framework::FetchType output1;
if (!FLAGS_skip_cpu) {
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
@ -60,12 +60,12 @@ TEST(inference, image_classification) {
LOG(INFO) << "Batch size is " << FLAGS_batch_size;
TestInference<paddle::platform::CPUPlace, false, true>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims();
LOG(INFO) << boost::get<paddle::framework::LoDTensor>(output1).dims();
}
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
@ -73,17 +73,18 @@ TEST(inference, image_classification) {
LOG(INFO) << "Batch size is " << FLAGS_batch_size;
TestInference<paddle::platform::CUDAPlace, false, true>(
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims();
LOG(INFO) << boost::get<paddle::framework::LoDTensor>(output2).dims();
if (!FLAGS_skip_cpu) {
CheckError<float>(output1, output2);
CheckError<float>(boost::get<paddle::framework::LoDTensor>(output1),
boost::get<paddle::framework::LoDTensor>(output2));
}
// float16 inference requires cuda GPUs with >= 5.3 compute capability
if (!FLAGS_fp16_dirname.empty() &&
paddle::platform::GetCUDAComputeCapability(0) >= 53) {
paddle::framework::LoDTensor output3;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs3;
paddle::framework::FetchType output3;
std::vector<paddle::framework::FetchType*> cpu_fetchs3;
cpu_fetchs3.push_back(&output3);
LOG(INFO) << "--- GPU Runs in float16 mode: ---";
@ -92,7 +93,8 @@ TEST(inference, image_classification) {
TestInference<paddle::platform::CUDAPlace, false, true>(
FLAGS_fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat);
CheckError<float>(output2, output3);
CheckError<float>(boost::get<paddle::framework::LoDTensor>(output2),
boost::get<paddle::framework::LoDTensor>(output3));
}
#endif
}

@ -63,25 +63,27 @@ TEST(inference, label_semantic_roles) {
cpu_feeds.push_back(&ctx_p2);
cpu_feeds.push_back(&mark);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
paddle::framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims();
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
LOG(INFO) << output1_tensor.lod();
LOG(INFO) << output1_tensor.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims();
auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
LOG(INFO) << output2_tensor.lod();
LOG(INFO) << output2_tensor.dims();
CheckError<float>(output1, output2);
CheckError<float>(output1_tensor, output2_tensor);
#endif
}

@ -118,8 +118,8 @@ void ThreadRunInfer(
inference_program->GetFetchTargetNames();
PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
paddle::framework::LoDTensor outtensor;
std::map<std::string, paddle::framework::FetchType*> fetch_targets;
paddle::framework::FetchType outtensor;
fetch_targets[fetch_target_names[0]] = &outtensor;
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
@ -150,7 +150,8 @@ void ThreadRunInfer(
std::string fetch_target_name = op->Input("X")[0];
int idx = boost::get<int>(op->GetAttr("col"));
*fetch_targets[fetch_target_name] =
paddle::framework::GetFetchVariable(*scope, "fetch", idx);
boost::get<paddle::framework::LoDTensor>(
paddle::framework::GetFetchVariable(*scope, "fetch", idx));
}
}
@ -215,8 +216,8 @@ TEST(inference, nlp) {
const std::vector<std::string>& fetch_target_names =
inference_program->GetFetchTargetNames();
PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
paddle::framework::LoDTensor outtensor;
std::map<std::string, paddle::framework::FetchType*> fetch_targets;
paddle::framework::FetchType outtensor;
fetch_targets[fetch_target_names[0]] = &outtensor;
// prepare feed

@ -41,28 +41,30 @@ TEST(inference, recognize_digits) {
cpu_feeds.push_back(&input);
for (auto is_combined : {false, true}) {
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
paddle::framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims();
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
LOG(INFO) << output1_tensor.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims();
auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
LOG(INFO) << output2_tensor.dims();
CheckError<float>(output1, output2);
CheckError<float>(output1_tensor, output2_tensor);
#endif
}
}

@ -65,23 +65,25 @@ TEST(inference, recommender_system) {
cpu_feeds.push_back(&category_id);
cpu_feeds.push_back(&movie_title);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
paddle::framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
LOG(INFO) << output1_tensor.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
LOG(INFO) << output2_tensor.dims();
CheckError<float>(output1, output2);
CheckError<float>(output1_tensor, output2_tensor);
#endif
}

@ -41,25 +41,27 @@ TEST(inference, rnn_encoder_decoder) {
cpu_feeds.push_back(&word_data);
cpu_feeds.push_back(&trg_word);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
paddle::framework::FetchType output1;
std::vector<paddle::framework::FetchType*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims();
auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
LOG(INFO) << output1_tensor.lod();
LOG(INFO) << output1_tensor.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
paddle::framework::FetchType output2;
std::vector<paddle::framework::FetchType*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims();
auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
LOG(INFO) << output2_tensor.lod();
LOG(INFO) << output2_tensor.dims();
CheckError<float>(output1, output2);
CheckError<float>(output1_tensor, output2_tensor);
#endif
}

Some files were not shown because too many files have changed in this diff.