@@ -33,6 +33,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 DECLARE_bool(benchmark);
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(enable_unused_var_check);
@@ -1102,11 +1106,8 @@ Scope* OperatorWithKernel::PrepareData(
   }
 
   for (auto& var_name_item : Inputs()) {
-    if (no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0) {
-      VLOG(7) << "Skip scanning input " << var_name_item.first
-              << " in Operator " << type_;
-      continue;
-    }
+    bool should_skip_input =
+        no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0;
 
     std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
 
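The early `continue` for no-buffer inputs becomes a per-slot flag here
because, as the next hunk shows, a buffer-less input may still need work
under MKL-DNN. A toy sketch of that control-flow change (stub types and
names, not Paddle's API):

    #include <set>
    #include <string>
    #include <vector>

    struct Slot { std::string name; std::vector<int> tensors; };

    void PrepareSlots(std::vector<Slot>* slots,
                      const std::set<std::string>& no_buffer_ins) {
      for (auto& slot : *slots) {
        // Record the skip decision instead of acting on it immediately,
        // so the per-tensor loop below can still special-case tensors.
        bool should_skip_input = no_buffer_ins.count(slot.name) > 0;
        for (int& t : slot.tensors) {
          if (should_skip_input) continue;  // MKL-DNN path may act here
          t += 1;  // stand-in for the real data preparation
        }
      }
    }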
@@ -1120,6 +1121,44 @@ Scope* OperatorWithKernel::PrepareData(
       }
 
       auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+
+      // When the input is listed in no_buffer_ins, checking Tensor::holder_
+      // is not thread safe, and for the InferShape scenario the checks
+      // being omitted are not really needed.
+      if (should_skip_input == true) {
+#ifdef PADDLE_WITH_MKLDNN
+        // A Var without a buffer may still be needed,
+        // e.g. for InferShape().
+        // In that situation we cannot skip the Var analysis, as the
+        // MKL-DNN shape of the Var may differ from its kNHWC shape,
+        // so a correspondingly resized Var
+        // has to be created and registered.
+        if ((tensor_in->layout() == DataLayout::kMKLDNN) &&
+            (var->IsType<LoDTensor>() == true) &&
+            (expected_kernel_key.data_layout_ != DataLayout::kMKLDNN) &&
+            (paddle::platform::get_cur_paddle_data_layout() ==
+             DataLayout::kNHWC)) {
+          // Mixed execution: MKL-DNN and GPU is not supported!
+          if (!new_scope) {
+            new_scope = &scope.NewScope();
+          }
+          auto* trans_var = new_scope->Var(var_name);
+          input_vars[i] = trans_var;
+          auto out = trans_var->GetMutable<LoDTensor>();
+          out->Resize(tensor_in->dims());
+          platform::MatchShapeToLayout(out, tensor_in->layout(),
+                                       DataLayout::kNHWC);
+          VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor, "
+                     "but with kNHWC layout: "
+                  << var_name_item.first << " in Operator " << type_;
+        } else {
+          VLOG(7) << "Skip scanning input " << var_name_item.first
+                  << " in Operator " << type_;
+        }
+#endif
+        continue;
+      }
+
       if (!tensor_in->IsInitialized()) {
         continue;
       }
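The block above resizes the dummy Var to the MKL-DNN tensor's dims and then
calls platform::MatchShapeToLayout to adjust them to kNHWC ordering. A
self-contained sketch of that dim permutation, assuming the source dims are
NCHW-ordered as the kMKLDNN layout implies (MatchShapeToNHWC is a
hypothetical stand-in; the real helper works on framework::Tensor):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Permute an NCHW-ordered dim vector into NHWC order; only 4-D
    // shapes are re-ordered, mirroring the layout-match idea above.
    std::vector<int64_t> MatchShapeToNHWC(const std::vector<int64_t>& nchw) {
      if (nchw.size() != 4) return nchw;
      return {nchw[0], nchw[2], nchw[3], nchw[1]};  // N, H, W, C
    }

    int main() {
      const std::vector<int64_t> mkldnn_dims = {8, 3, 224, 224};  // NCHW
      for (int64_t d : MatchShapeToNHWC(mkldnn_dims)) std::cout << d << ' ';
      std::cout << '\n';  // prints: 8 224 224 3
    }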
@@ -1143,14 +1182,17 @@ Scope* OperatorWithKernel::PrepareData(
       // In the inference scenario, the scopes will be reused across the
       // batches, so the `new_scope` here will result in GPU memory explosion
       // over the running of operators.
-      // We use a thread_local cache to fix that issue, the key in the cache is
+      // We use a thread_local cache to fix that issue; the key in the cache
+      // is
       // the combination of the `scope` argument, from_kernel_type,
       // target_kernel_type.
       // Have a discussion with @Superjomn or the inference developers if some
       // changes on this logic for this macro might not be tested on the other
       // scenarios.
-      // If this op is not called by an Executor or ParallelExecutor, it should
-      // called by a NaiveExecutor, the NaiveExecutor will cache the scopes and
+      // If this op is not called by an Executor or ParallelExecutor, it
+      // should
+      // be called by a NaiveExecutor; the NaiveExecutor will cache the scopes
+      // and
       // variables, and that behavior is a lot different.
       //
       // To solve issue #15032, have a discussion with @Luotao for cpu
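The thread_local cache described here keeps one transfer scope per
(scope, from_kernel_type, target_kernel_type) combination, so repeated
inference batches reuse a scope instead of allocating new ones. A minimal
compilable sketch of that idea (illustrative types; the real logic lives
around TryCreateTransferScope in the code base and differs in detail):

    #include <map>
    #include <tuple>
    #include <vector>

    struct Scope {
      Scope* NewScope() {
        kids.push_back(new Scope);
        return kids.back();
      }
      std::vector<Scope*> kids;
    };

    Scope* GetTransferScope(Scope* scope, int from_kernel_type,
                            int to_kernel_type) {
      using Key = std::tuple<Scope*, int, int>;
      thread_local std::map<Key, Scope*> cache;  // survives across batches
      Key key{scope, from_kernel_type, to_kernel_type};
      auto it = cache.find(key);
      if (it != cache.end()) return it->second;  // hit: no new allocation
      Scope* transfer_scope = scope->NewScope();
      cache.emplace(key, transfer_scope);
      return transfer_scope;
    }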
@@ -1174,15 +1216,14 @@ Scope* OperatorWithKernel::PrepareData(
       // we will create a new cpu tensor in the new scope.
       // However, if enable_cache_runtime_context_, we get the cpu tensor each
       // time, not the gpu tensor.
-      // Thus, we set pre_scope_ = nullptr to trigger `new RuntimeContext()` in
+      // Thus, we set pre_scope_ = nullptr to trigger `new RuntimeContext()`
+      // in
       // RunImpl().
       if (enable_cache_runtime_context_) {
        pre_scope_ = nullptr;
       }
-
       auto* trans_var = new_scope->Var(var_name);
       input_vars[i] = trans_var;
-
       Tensor out;
       TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
       SetTensorToVariable(*var, out, trans_var);
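When enable_cache_runtime_context_ is set, PrepareData clears pre_scope_ so
that the next RunImpl() rebuilds its cached RuntimeContext and picks up the
freshly transformed inputs. A simplified sketch of that interplay, assuming
RunImpl guards its cache with a pointer comparison as the comment implies
(not the real OperatorWithKernel):

    #include <memory>

    struct Scope {};
    struct RuntimeContext {};

    struct OpSketch {
      const Scope* pre_scope_ = nullptr;
      std::unique_ptr<RuntimeContext> runtime_ctx_;

      void RunImpl(const Scope& scope) {
        // pre_scope_ == nullptr never matches &scope, so a reset in
        // PrepareData forces this branch and a fresh RuntimeContext.
        if (!runtime_ctx_ || pre_scope_ != &scope) {
          runtime_ctx_.reset(new RuntimeContext());
          pre_scope_ = &scope;
        }
        // ... choose kernel, prepare data, launch ...
      }
    };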