// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <cstring>
#include <map>
#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/inference/api/api_anakin_engine.h"
#include "paddle/fluid/inference/api/paddle_api.h"

#include "framework/core/net/net.h"
#include "framework/operators/ops.h"
#include "saber/funcs/timer.h"

namespace paddle {

using paddle::contrib::AnakinConfig;
template <typename T, Precision P, OpRunType R>
extern std::mutex PaddleInferenceAnakinPredictor<T, P, R>::mutex_;
template <typename T, Precision P, OpRunType R>
extern std::once_flag PaddleInferenceAnakinPredictor<T, P, R>::init_anakin_;

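// Set up the global Anakin environment once per process (guarded by
// init_anakin_) and bind this predictor to the configured device.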
template <typename T, Precision P, OpRunType R>
void PaddleInferenceAnakinPredictor<T, P, R>::InitEnv() {
  std::call_once(this->init_anakin_, [this]() {
    anakin::Env<T>::env_init(this->config_.max_stream);
  });
  anakin::TargetWrapper<T>::set_device(this->config_.device_id);
}
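// Create the Anakin executor from the optimized graph. Construction is
// serialized with mutex_ because the executor may also be rebuilt from
// RunImpl() when an input is reshaped.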
template <typename T, Precision P, OpRunType R>
void PaddleInferenceAnakinPredictor<T, P, R>::InitNet() {
  std::unique_lock<std::mutex> lock(this->mutex_);
  this->executor_p_ = new anakin::Net<T, P, R>(*this->graph_p_, true);
}
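// Create the device context from the configured device id and the
// data/compute stream ids.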
template <typename T, Precision P, OpRunType R>
void PaddleInferenceAnakinPredictor<T, P, R>::SetContext() {
  this->ctx_p_ = std::make_shared<anakin::Context<T>>(
      this->config_.device_id, this->config_.data_stream_id,
      this->config_.compute_stream_id);
}
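// Load the model graph from model_file or from the in-memory buffer, collect
// the graph's input/output names, and reshape every input to the shape given
// in init_inputs_shape.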
template <typename T, Precision P, OpRunType R>
void PaddleInferenceAnakinPredictor<T, P, R>::InitGraph() {
  this->graph_p_ =
      std::make_shared<anakin::graph::Graph<T, anakin::Precision::FP32>>();
  if (!this->config_.model_file.empty()) {
    this->graph_p_->load(this->config_.model_file);
  } else if (this->config_.model_buf_p) {
    this->graph_p_->load(this->config_.model_buf_p,
                         this->config_.model_buf_len);
  } else {
    LOG(FATAL) << "Model load error.";
  }
  this->input_names_ = this->graph_p_->get_ins();
  this->output_names_ = this->graph_p_->get_outs();
  for (auto &input_str : this->input_names_) {
    if (this->config_.init_inputs_shape.find(input_str) ==
        this->config_.init_inputs_shape.end()) {
      LOG(FATAL) << input_str << " should be set in init_inputs_shape.";
    }
    std::vector<int> shape =
        this->config_.init_inputs_shape.find(input_str)->second;
    this->graph_p_->Reshape(input_str, shape);
  }
}
template <typename T, Precision P, OpRunType R>
void PaddleInferenceAnakinPredictor<T, P, R>::OptimizeGraph() {
  if (!this->graph_p_->Optimize()) {
    LOG(FATAL) << "Graph optimization error.";
  }
}
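// Initialization pipeline: environment, device context, graph loading, graph
// optimization, and executor creation.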
template <typename T, Precision P, OpRunType R>
void PaddleInferenceAnakinPredictor<T, P, R>::InitPredictor() {
  this->InitEnv();
  this->SetContext();
  this->InitGraph();
  this->OptimizeGraph();
  this->InitNet();
}
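// Execute one forward pass, synchronizing the device before and after the
// run.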
template <typename T, Precision P, OpRunType R>
void PaddleInferenceAnakinPredictor<T, P, R>::Predict() {
  anakin::TargetWrapper<T>::device_sync();
  this->executor_p_->prediction();
  anakin::TargetWrapper<T>::device_sync();
}
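// Public inference entry point. If reallocation is allowed, the inputs are
// forwarded to RunImpl() as-is; otherwise, LoD inputs whose batch size
// exceeds init_batch_size are split into chunks of at most init_batch_size,
// each chunk is run separately, and the per-chunk outputs are concatenated
// into the caller's output buffers.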
template <typename T, Precision P, OpRunType R>
bool PaddleInferenceAnakinPredictor<T, P, R>::Run(
    const std::vector<PaddleTensor> &inputs,
    std::vector<PaddleTensor> *output_data, int batch_size) {
  if (this->config_.re_allocable) {
    return this->RunImpl(inputs, output_data);
  } else {
    // Run input data that exceeds the batch size in batches.
    // 1. Reassign the batch size.
    if (batch_size == -1) {
      if (!inputs[0].lod.empty()) {
        batch_size = inputs[0].lod[0].size() - 1;
      } else {
        batch_size = inputs[0].shape[0];
      }
    }
    // 2. If the data doesn't need to be batched, run it directly.
    if (batch_size <= this->config_.init_batch_size) {
      return this->RunImpl(inputs, output_data);
    }
    // 3. Check the batch size and define temporary variables.
    std::vector<PaddleTensor> cur_inputs;
    std::vector<PaddleTensor> outputs_master;
    std::vector<std::vector<paddle::PaddleTensor>> outputs_vec;
    for (const auto &input : inputs) {
      if (!input.lod.empty()) {
        if (input.lod.size() != 1) {
          return false;
        }
        if (input.lod[0].size() - 1 != batch_size) {
          return false;
        }
      } else {
        LOG(INFO) << "Non-lod mode to be implemented.";
        return false;
      }
      PaddleTensor tensor;
      tensor.name = input.name;
      tensor.dtype = PaddleDType::FLOAT32;
      cur_inputs.push_back(tensor);
    }
    for (auto output : *output_data) {
      PaddleTensor tensor;
      tensor.name = output.name;
      outputs_master.push_back(tensor);
    }
    // 4. Batch execution.
    for (size_t start_batch = 0; start_batch < batch_size;) {
      auto end_batch = start_batch + this->config_.init_batch_size;
      if (end_batch > batch_size) {
        end_batch = batch_size;
      }
      auto cur_outputs = outputs_master;
      for (size_t i = 0; i < inputs.size(); i++) {
        auto start = inputs[i].lod[0][start_batch];
        auto end = inputs[i].lod[0][end_batch];
        std::vector<size_t> offsets;
        for (size_t j = start_batch; j <= end_batch; j++) {
          offsets.push_back(inputs[i].lod[0][j] -
                            inputs[i].lod[0][start_batch]);
        }
        auto mem_start = static_cast<float *>(inputs[i].data.data()) + start;
        cur_inputs[i].data =
            PaddleBuf(mem_start, (end - start) * sizeof(float));
        cur_inputs[i].lod = std::vector<std::vector<size_t>>({offsets});
        cur_inputs[i].shape =
            std::vector<int>({static_cast<int>(end - start), 1, 1, 1});
      }
      if (!this->RunImpl(cur_inputs, &cur_outputs)) {
        return false;
      }
      outputs_vec.push_back(cur_outputs);
      start_batch = end_batch;
    }
    // 5. Copy the results to contiguous memory.
    // Assume that each batch has the same final output size.
    auto count = [](const std::vector<int> &v) {
      int cnt = 1;
      for_each(v.begin(), v.end(), [&cnt](int n) { cnt *= n; });
      return cnt;
    };
    for (size_t i = 0; i < output_data->size(); i++) {
      // Take the shape of the i-th output from the first batch chunk and
      // rewrite its leading dimension with the full batch size.
      std::vector<int> shape = outputs_vec[0][i].shape;
      shape[0] = batch_size;
      int total_cnt = count(shape);
      (*output_data)[i].shape = shape;
      (*output_data)[i].data.Resize(total_cnt * sizeof(float));
      float *addr = static_cast<float *>((*output_data)[i].data.data());
      for (const auto &single_out : outputs_vec) {
        int cnt = count(single_out[i].shape);
        memcpy(addr, single_out[i].data.data(), cnt * sizeof(float));
        addr += cnt;
      }
    }
  }
  return true;
}
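// Feed the host input tensors into the executor's device tensors
// (re-creating the executor when an input outgrows its buffer and
// reallocation is allowed), run the prediction, and copy the device outputs
// back into the caller's PaddleTensor buffers.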
template <typename T, Precision P, OpRunType R>
bool PaddleInferenceAnakinPredictor<T, P, R>::RunImpl(
    const std::vector<PaddleTensor> &inputs,
    std::vector<PaddleTensor> *output_data) {
  anakin::TargetWrapper<T>::set_device(this->config_.device_id);
  for (const auto &input : inputs) {
    if (input.dtype != PaddleDType::FLOAT32) {
      LOG(FATAL) << "Only FLOAT32 inputs are supported, but " << input.name
                 << "'s type is not float.";
    }
    auto d_tensor_p = this->executor_p_->get_in(input.name);
    auto net_shape = d_tensor_p->valid_shape();
    if (net_shape.size() != input.shape.size()) {
      LOG(FATAL) << "Input " << input.name
                 << "'s shape size should be equal to that of the net.";
    }
    int sum = 1;
    for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; });
    if (sum > net_shape.count()) {
      if (this->config_.re_allocable) {
        this->graph_p_->Reshape(input.name, input.shape);
        delete this->executor_p_;
        this->InitNet();
        d_tensor_p = this->executor_p_->get_in(input.name);
      } else {
        LOG(FATAL)
            << "Run failed because Anakin was expected not to reallocate "
               "memory.";
      }
    }
    std::vector<int> tmp_shape;
    for (auto s : input.shape) {
      tmp_shape.push_back(s);
    }
    auto *data = static_cast<float *>(input.data.data());
    anakin::saber::Tensor<typename anakin::DefaultHostType<T>::Host_type>
        h_tensor(data, typename anakin::DefaultHostType<T>::Host_type(), 0,
                 tmp_shape);
    d_tensor_p->reshape(tmp_shape);

    if (input.lod.size() > 0) {
      if (input.lod.size() > 1) {
        LOG(FATAL) << "Input lod size should be <= 1, but it is "
                   << input.lod.size();
      }
      std::vector<int> lod(input.lod[0].begin(), input.lod[0].end());
      std::vector<std::vector<int>> offset({lod});
      d_tensor_p->set_seq_offset(offset);
      VLOG(3) << "offset.size(): " << offset[0].size();
      for (int i = 0; i < offset[0].size(); i++) {
        VLOG(3) << offset[0][i];
      }
    }
    d_tensor_p->copy_from(h_tensor);
  }
  this->Predict();
  if (output_data->empty()) {
    LOG(FATAL) << "At least one output should be set with tensors' names.";
  }
  for (auto &output : *output_data) {
    if (std::find(this->output_names_.begin(), this->output_names_.end(),
                  output.name) == this->output_names_.end()) {
      LOG(FATAL) << output.name << " is not in the outputs of the graph.";
    }
    auto *d_tensor_p = this->executor_p_->get_out(output.name);
    output.shape = d_tensor_p->valid_shape();
    if (output.data.length() < d_tensor_p->valid_size() * sizeof(float)) {
      output.data.Resize(d_tensor_p->valid_size() * sizeof(float));
    }
    auto *data = static_cast<float *>(output.data.data());
    anakin::saber::Tensor<typename anakin::DefaultHostType<T>::Host_type>
        h_tensor(data, typename anakin::DefaultHostType<T>::Host_type(), 0,
                 d_tensor_p->valid_shape());
    h_tensor.copy_from(*d_tensor_p);
  }
  return true;
}
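// Rebuild this predictor on top of another predictor's config, graph, and
// input/output names, then create a fresh executor, so that the new instance
// shares the original's graph (and thus its weights); used by Clone().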
template <typename T, Precision P, OpRunType R>
bool PaddleInferenceAnakinPredictor<T, P, R>::Reset(
    PaddleInferenceAnakinPredictor<T, P, R> *predictor) {
  this->config_ = predictor->GetConfig();
  this->graph_p_ = predictor->GetGraph();
  this->input_names_ = predictor->GetInputNames();
  this->output_names_ = predictor->GetOutputNames();
  this->ctx_p_ = std::make_shared<anakin::Context<T>>(
      this->config_.device_id, this->config_.data_stream_id,
      this->config_.compute_stream_id);
  this->InitNet();
  return true;
}
template <typename T, Precision P, OpRunType R>
std::unique_ptr<PaddlePredictor>
PaddleInferenceAnakinPredictor<T, P, R>::New() {
  return std::unique_ptr<PaddlePredictor>(
      new PaddleInferenceAnakinPredictor<T, P, R>());
}
// The cloned Anakin predictor shares the net weights with the original
// predictor.
template <typename T, Precision P, OpRunType R>
std::unique_ptr<PaddlePredictor>
PaddleInferenceAnakinPredictor<T, P, R>::Clone() {
  VLOG(3) << "Anakin Predictor::clone";
  std::unique_ptr<PaddlePredictor> cls = std::move(this->New());
  auto anakin_predictor_p =
      dynamic_cast<PaddleInferenceAnakinPredictor<T, P, R> *>(cls.get());
  if (!anakin_predictor_p) {
    LOG(FATAL) << "Failed to call Init.";
  }
  anakin_predictor_p->Reset(this);
  return cls;
}

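// MLU-specific overrides: the context additionally carries model-parallel
// and operator-fusion settings, and the executor is built and run through
// Anakin's fusion_init()/fusion_prediction() entry points.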
#ifdef ANAKIN_MLU_PLACE
template <Precision P, OpRunType R>
std::unique_ptr<PaddlePredictor>
PaddleInferenceAnakinMLUPredictor<P, R>::New() {
  return std::unique_ptr<PaddlePredictor>(
      new PaddleInferenceAnakinMLUPredictor<P, R>());
}
template <Precision P, OpRunType R>
void PaddleInferenceAnakinMLUPredictor<P, R>::SetContext() {
  this->ctx_p_ = std::make_shared<anakin::Context<anakin::MLU>>(
      this->config_.device_id, this->config_.data_stream_id,
      this->config_.compute_stream_id);
  this->ctx_p_->set_model_parallel(this->config_.model_parallel);
  this->ctx_p_->set_fusion(this->config_.op_fuse);
}
template <Precision P, OpRunType R>
void PaddleInferenceAnakinMLUPredictor<P, R>::OptimizeGraph() {
  if (!this->graph_p_->fusion_optimize(this->config_.op_fuse)) {
    LOG(FATAL) << "Graph optimization error.";
  }
}
template <Precision P, OpRunType R>
void PaddleInferenceAnakinMLUPredictor<P, R>::InitNet() {
  std::unique_lock<std::mutex> lock(this->mutex_);
  this->executor_p_ = new anakin::Net<anakin::MLU, P, R>();
  this->executor_p_->fusion_init(*this->graph_p_, this->ctx_p_, true);
}
template <Precision P, OpRunType R>
void PaddleInferenceAnakinMLUPredictor<P, R>::Predict() {
  anakin::TargetWrapper<anakin::MLU>::device_sync();
  this->executor_p_->fusion_prediction();
  anakin::TargetWrapper<anakin::MLU>::device_sync();
}
#endif

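// BM-specific overrides, analogous to the MLU ones but without the extra
// context settings.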
#ifdef ANAKIN_BM_PLACE
template <Precision P, OpRunType R>
std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinBMPredictor<P, R>::New() {
  return std::unique_ptr<PaddlePredictor>(
      new PaddleInferenceAnakinBMPredictor<P, R>());
}
template <Precision P, OpRunType R>
void PaddleInferenceAnakinBMPredictor<P, R>::OptimizeGraph() {
  if (!this->graph_p_->fusion_optimize()) {
    LOG(FATAL) << "Graph optimization error.";
  }
}
template <Precision P, OpRunType R>
void PaddleInferenceAnakinBMPredictor<P, R>::InitNet() {
  std::unique_lock<std::mutex> lock(this->mutex_);
  this->executor_p_ = new anakin::Net<anakin::BM, P, R>();
  this->executor_p_->fusion_init(*this->graph_p_, this->ctx_p_, true);
}
template <Precision P, OpRunType R>
void PaddleInferenceAnakinBMPredictor<P, R>::Predict() {
  anakin::TargetWrapper<anakin::BM>::device_sync();
  this->executor_p_->fusion_prediction();
  anakin::TargetWrapper<anakin::BM>::device_sync();
}
#endif

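// Explicit instantiations of the predictor templates for the supported
// target platforms.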
#ifdef PADDLE_WITH_CUDA
template class PaddleInferenceAnakinPredictor<
    anakin::NV, anakin::Precision::FP32, ::anakin::OpRunType::ASYNC>;
#endif
#ifdef ANAKIN_X86_PLACE
template class PaddleInferenceAnakinPredictor<
    anakin::X86, anakin::Precision::FP32, ::anakin::OpRunType::ASYNC>;
#endif
#ifdef ANAKIN_MLU_PLACE
template class PaddleInferenceAnakinMLUPredictor<anakin::Precision::FP32,
                                                 ::anakin::OpRunType::SYNC>;
#endif
#ifdef ANAKIN_BM_PLACE
template class PaddleInferenceAnakinBMPredictor<anakin::Precision::FP32,
                                                ::anakin::OpRunType::ASYNC>;
#endif

// A factory that creates the proper predictor for the configured target
// platform.
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<contrib::AnakinConfig, PaddleEngineKind::kAnakin>(
    const contrib::AnakinConfig &config) {
#ifdef PADDLE_WITH_CUDA
  if (config.target_type == contrib::AnakinConfig::NVGPU) {
    return std::unique_ptr<PaddlePredictor>(
        new PaddleInferenceAnakinPredictor<anakin::NV, anakin::Precision::FP32,
                                           ::anakin::OpRunType::ASYNC>(config));
  }
#endif
#ifdef ANAKIN_X86_PLACE
  if (config.target_type == contrib::AnakinConfig::X86) {
    return std::unique_ptr<PaddlePredictor>(
        new PaddleInferenceAnakinPredictor<anakin::X86, anakin::Precision::FP32,
                                           ::anakin::OpRunType::ASYNC>(config));
  }
#endif
#ifdef ANAKIN_MLU_PLACE
  if (config.target_type == contrib::AnakinConfig::MLU) {
    return std::unique_ptr<PaddlePredictor>(
        new PaddleInferenceAnakinMLUPredictor<anakin::Precision::FP32,
                                              ::anakin::OpRunType::SYNC>(
            config));
  }
#endif
#ifdef ANAKIN_BM_PLACE
  if (config.target_type == contrib::AnakinConfig::BM) {
    return std::unique_ptr<PaddlePredictor>(
        new PaddleInferenceAnakinBMPredictor<anakin::Precision::FP32,
                                             ::anakin::OpRunType::ASYNC>(
            config));
  }
#endif
  LOG(FATAL) << "Anakin predictor was created on an unknown platform: "
             << config.target_type;
  return nullptr;
}
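// Dump per-operator timing collected by the executor; compiled in only when
// PADDLE_ANAKIN_ENABLE_OP_TIMER is defined.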
template <typename T, Precision P, OpRunType R>
void DisplayOpTimer(anakin::Net<T, P, R> *net_executor, int epoch) {
#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER
  std::vector<float> op_time = net_executor->get_op_time();
  auto exec_funcs = net_executor->get_exec_funcs();
  auto op_param = net_executor->get_op_param();
  for (int i = 0; i < op_time.size(); i++) {
    LOG(INFO) << "name: " << exec_funcs[i].name
              << " op_type: " << exec_funcs[i].op_name
              << " op_param: " << op_param[i] << " time " << op_time[i] / epoch;
  }
  std::map<std::string, float> op_map;
  for (int i = 0; i < op_time.size(); i++) {
    auto it = op_map.find(op_param[i]);
    if (it != op_map.end())
      op_map[op_param[i]] += op_time[i];
    else
      op_map.insert(std::pair<std::string, float>(op_param[i], op_time[i]));
  }
  for (auto it = op_map.begin(); it != op_map.end(); ++it) {
    LOG(INFO) << it->first << " " << (it->second) / epoch << " ms";
  }
#endif
}
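// Report op timing (if enabled) and release the executor.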
template <typename T, Precision P, OpRunType R>
PaddleInferenceAnakinPredictor<T, P, R>::~PaddleInferenceAnakinPredictor() {
  DisplayOpTimer<T, P, R>(this->executor_p_, this->config_.init_batch_size);
  delete this->executor_p_;
  this->executor_p_ = nullptr;
}

}  // namespace paddle