Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into accelerate_ddpg

test=develop
6 years ago · 8ed0233924
parent 68b86d6665 9c6a0203e2
commit 8ed0233924
108 changed files with 2800 additions and 253 deletions
--- a/paddle/contrib/float16/float16_transpiler.py
+++ b/paddle/contrib/float16/float16_transpiler.py
@ -60,7 +60,7 @@ class Float16Transpiler:
            raise TypeError("place should be as CPUPlace/CUDAPlace type")
        if scope is None:
            scope = global_scope()
-        if not isinstance(scope, core.Scope):
+        if not isinstance(scope, core._Scope):
            raise TypeError("scope should be as Scope type or None")
        self.scope = scope
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -351,6 +351,23 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
 paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
 paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.CompressPass.apply ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.ImitationGraph.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.ImitationGraph.all_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.SensitivePruneStrategy.__init__ ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None))
 paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.MagnitudePruner.__init__ ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.MagnitudePruner.prune ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.RatioPruner.__init__ ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.RatioPruner.prune ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
@ -447,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke
 paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
-paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None
+paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
 paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
 paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
 paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
 paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
 paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
 paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
 paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -48,10 +48,10 @@ if(WITH_GPU)
    nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
    add_dependencies(tensor tensor_util)
  else()
-    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
  endif(WIN32)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
 endif()
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@ -84,6 +84,7 @@ cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
 cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash)
 cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@ -165,7 +165,7 @@ template <typename T>
 class GreaterThanChecker {
 public:
  explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
-  void operator()(T& value) const {
+  void operator()(const T& value) const {
    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
  }
@ -177,7 +177,7 @@ template <typename T>
 class EqualGreaterThanChecker {
 public:
  explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
-  void operator()(T& value) const {
+  void operator()(const T& value) const {
    PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
  }
@ -193,7 +193,7 @@ class DefaultValueSetter {
 public:
  explicit DefaultValueSetter(T default_value)
      : default_value_(default_value) {}
-  void operator()(T& value) const { value = default_value_; }  // NOLINT
+  void operator()(T* value) const { *value = default_value_; }
 private:
  T default_value_;
@ -203,7 +203,7 @@ template <typename T>
 class EnumInContainer {
 public:
  explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
-  void operator()(T& val) const {
+  void operator()(const T& val) const {
    PADDLE_ENFORCE(container_.find(val) != container_.end(),
                   "Value %s is not in enum container %s", val,
                   ContainerDebugString());
@ -232,7 +232,8 @@ class EnumInContainer {
 // an attribute can have more than one limits
 template <typename T>
 class TypedAttrChecker {
-  typedef std::function<void(T&)> ValueChecker;
+  typedef std::function<void(T*)> DefaultValueChecker;
  typedef std::function<void(const T&)> ValueChecker;
 public:
  explicit TypedAttrChecker(const std::string& attr_name)
@ -268,17 +269,17 @@ class TypedAttrChecker {
    return *this;
  }
-  void operator()(AttributeMap& attr_map) const {  // NOLINT
+  void operator()(AttributeMap* attr_map) const {
-    if (!attr_map.count(attr_name_)) {
+    if (!attr_map->count(attr_name_)) {
      // user do not set this attr
      PADDLE_ENFORCE(!default_value_setter_.empty(),
                     "Attribute '%s' is required!", attr_name_);
      // default_value_setter_ has no more than one element
      T val;
-      (default_value_setter_[0])(val);
+      (default_value_setter_[0])(&val);
-      attr_map[attr_name_] = val;
+      (*attr_map)[attr_name_] = val;
    }
-    Attribute& attr = attr_map.at(attr_name_);
+    Attribute& attr = attr_map->at(attr_name_);
    ExtractAttribute<T> extract_attr(attr_name_);
    T* attr_value = extract_attr(attr);
    for (const auto& checker : value_checkers_) {
@ -289,12 +290,12 @@ class TypedAttrChecker {
 private:
  std::string attr_name_;
  std::vector<ValueChecker> value_checkers_;
-  std::vector<ValueChecker> default_value_setter_;
+  std::vector<DefaultValueChecker> default_value_setter_;
 };
 // check whether op's all attributes fit their own limits
 class OpAttrChecker {
-  typedef std::function<void(AttributeMap&)> AttrChecker;
+  typedef std::function<void(AttributeMap*)> AttrChecker;
 public:
  template <typename T>
@ -304,7 +305,7 @@ class OpAttrChecker {
    return *(checker.target<TypedAttrChecker<T>>());
  }
-  void Check(AttributeMap& attr_map) const {  // NOLINT
+  void Check(AttributeMap* attr_map) const {
    for (const auto& checker : attr_checkers_) {
      checker(attr_map);
    }
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@ -355,7 +355,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
          BuildStrategy::GradientScaleStrategy::kCustomized) {
        // TODO(paddle-dev): Why is there no input for this op_handle?
        auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
-        CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]);
+        auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType();
        CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0],
                              out_dtype);
      }
      // This assumes the backward generating code will ensure IsScaleLossOp
      // is true only for the op that scale the final scalar loss.
@ -658,13 +660,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(
 void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
    ir::Graph *result, const std::string &loss_grad_name,
-    ir::Node *out_var_node) const {
+    ir::Node *out_var_node, proto::VarType::Type dtype) const {
  for (size_t i = 0; i < places_.size(); ++i) {
    // Insert ScaleCost OpHandle
    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
    auto *op_handle = new ScaleLossGradOpHandle(
        result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
-        local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx);
+        local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype);
    result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
    // FIXME: Currently ScaleLossGradOp only use device_count as scale
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
  void CreateScaleLossGradOp(ir::Graph *result,
                             const std::string &loss_grad_name,
-                             ir::Node *out_var_node) const;
+                             ir::Node *out_var_node,
                             proto::VarType::Type dtype) const;
  VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                            int dst_dev_id) const;
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@ -22,39 +22,66 @@ namespace details {
 ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
                                             Scope *scope,
                                             platform::Place place,
-                                             platform::DeviceContext *dev_ctx)
+                                             platform::DeviceContext *dev_ctx,
                                             proto::VarType::Type dtype)
    : OpHandleBase(node),
      coeff_(static_cast<float>(1.0 / num_dev)),
      scope_(scope),
-      place_(place) {
+      place_(place),
      out_dtype_(dtype) {
  this->SetDeviceContext(place_, dev_ctx);
 }
 ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
-void ScaleLossGradOpHandle::RunImpl() {
+struct ScaleLossGradFunctor {
-  // Doesn't wait any event
+  float coeff_;
-  std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
+  Tensor *out_;
-  auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  platform::Place place_;
  OpHandleBase *op_handle_;
  proto::VarType::Type out_dtype_;
  platform::DeviceContext *ctx_;
-  float *tmp = local_scope.FindVar(var_name)
+  ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
-                   ->GetMutable<LoDTensor>()
+                       OpHandleBase *op_handle, proto::VarType::Type dtype,
-                   ->mutable_data<float>(make_ddim({1}), place_);
+                       platform::DeviceContext *ctx)
      : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
  template <typename OutT>
  void apply() const {
    auto *out_data = out_->mutable_data<OutT>(place_);
    if (platform::is_cpu_place(place_)) {
-    *tmp = coeff_;
+      *out_data = static_cast<OutT>(coeff_);
    } else {
 #ifdef PADDLE_WITH_CUDA
-    this->RunAndRecordEvent([&] {
+      OutT cast_coeff = static_cast<OutT>(coeff_);
-      auto stream = static_cast<platform::CUDADeviceContext *>(
+      auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
-                        this->dev_ctxes_.at(place_))
+      memory::Copy(boost::get<platform::CUDAPlace>(place_), out_data,
-                        ->stream();
+                   platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
-      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
+                   stream);
                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
      VLOG(10) << place_ << "RUN Scale loss grad op";
-    });
+
 #endif
    }
  }
 };
 void ScaleLossGradOpHandle::RunImpl() {
  // Doesn't wait any event
  std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
  auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
  auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
  tensor->Resize(make_ddim({1}));
 #ifdef PADDLE_WITH_CUDA
  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
                            this->dev_ctxes_.at(place_));
  this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
 #else
  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
  framework::VisitDataType(out_dtype_, func);
 #endif
 }
 std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@ -26,8 +26,8 @@ namespace details {
 struct ScaleLossGradOpHandle : public OpHandleBase {
  ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
-                        platform::Place place,
+                        platform::Place place, platform::DeviceContext *context,
-                        platform::DeviceContext *context);
+                        proto::VarType::Type dtype);
  ~ScaleLossGradOpHandle() final;
@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
  float coeff_;
  Scope *scope_;
  platform::Place place_;
  proto::VarType::Type out_dtype_;
 };
 }  // namespace details
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@ -31,10 +31,12 @@ std::map<std::string,
                            std::shared_ptr<std::unordered_map<
                                std::string, std::shared_ptr<ngraph::Node>>>)>>
    NgraphBridge::NG_NODE_MAP = {
        {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
        {"mul", paddle::operators::ngraphs::BuildMulNode},
        {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
        {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
-        {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}};
+        {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
        {"top_k", paddle::operators::ngraphs::BuildTopKNode}};
 void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
  auto& op_type = op->Type();
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@ -643,7 +643,7 @@ void OpDesc::CheckAttrs() {
    // not by users.
    return;
  }
-  checker->Check(attrs_);
+  checker->Check(&attrs_);
 }
 void OpDesc::InferShape(const BlockDesc &block) const {
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@ -82,6 +82,10 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
  AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.")
      .SetDefault("");
  AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
                                    "Callstack for Op Creatation.")
      .SetDefault({});
  Validate();
 }
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@ -47,6 +47,7 @@ class OpProtoAndCheckerMaker {
  static const char *OpRoleAttrName() { return "op_role"; }
  static const char *OpRoleVarAttrName() { return "op_role_var"; }
  static const char *OpNamescopeAttrName() { return "op_namescope"; }
  static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
  void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
--- a/paddle/fluid/framework/op_registry.cc
+++ b/paddle/fluid/framework/op_registry.cc
@ -24,7 +24,7 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
    const VariableNameMap& outputs, AttributeMap attrs) {
  auto& info = OpInfoMap::Instance().Get(type);
  if (info.Checker() != nullptr) {
-    info.Checker()->Check(attrs);
+    info.Checker()->Check(&attrs);
  }
  auto op = info.Creator()(type, inputs, outputs, attrs);
  return std::unique_ptr<OperatorBase>(op);
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -16,10 +16,15 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <algorithm>
-
+#include <sstream>
 #include <string>
 #include <vector>
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
@ -157,7 +162,10 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames,
 }
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
  try {
    if (VLOG_IS_ON(4)) {
      VLOG(4) << place << " " << DebugStringEx(&scope);
    }
    if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
      PADDLE_THROW("Cannot run operator on place %s", place);
@ -167,17 +175,46 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
    }
-  // The profile has a process-wide mutex, results in serious performance issue
+    // The profile has a process-wide mutex, results in serious performance
    // issue
    // in concurrency scenario. Here use an `if` to fix this issue.
    // Please do not remove the `if`; ask @Superjomn if there are any concerns.
    if (platform::IsProfileEnabled()) {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
      platform::RecordEvent record_event(Type(), pool.Get(place));
      RunImpl(scope, place);
    } else {
      RunImpl(scope, place);
    }
    if (VLOG_IS_ON(3)) {
      VLOG(3) << place << " " << DebugStringEx(&scope);
    }
  } catch (platform::EnforceNotMet exception) {
    if (Attrs().count("sub_block") != 0) {
      throw exception;
    }
    auto& callstack = Attr<std::vector<std::string>>(
        OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
    if (callstack.empty()) {
      throw exception;
    }
    std::ostringstream sout;
    sout << "Invoke operator " << Type() << " error.\n";
    sout << "Python Callstacks: \n";
    for (auto& line : callstack) {
      sout << line;
    }
    sout << "C++ Callstacks: \n";
    sout << exception.err_str_;
    exception.err_str_ = sout.str();
    throw exception;
  } catch (...) {
    std::rethrow_exception(std::current_exception());
  }
 }
 bool OperatorBase::HasInputs(const std::string& name) const {
@ -1057,8 +1094,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
          t = &(var->Get<SelectedRows>().value());
        }
        if (t != nullptr) {
-          PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s",
+          PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized",
-                         ipt_name, DebugString());
+                         ipt_name);
          int tmp = static_cast<int>(t->type());
          PADDLE_ENFORCE(
              tmp == data_type || data_type == -1,
--- a/paddle/fluid/framework/scope_pool.cc
+++ b/paddle/fluid/framework/scope_pool.cc
@ -0,0 +1,54 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/scope_pool.h"
 #include "paddle/fluid/framework/threadpool.h"
 namespace paddle {
 namespace framework {
 // Returns a reference to the single ScopePool instance, which is held in a
 // function-local static and therefore constructed on the first call.
 ScopePool &ScopePool::Instance() {  // NOLINT
  static ScopePool pool;
  return pool;
 }
 // Destroys `scope` with a plain `delete`.
 void ScopePool::DeleteScope(Scope *scope) { delete scope; }
 void ScopePool::Insert(std::unique_ptr<Scope> &&s) {
  std::lock_guard<std::mutex> guard(mtx_);
  scopes_.insert(s.release());
 }
 // Unregisters `s` from the pool and deletes it.
 // Raises through PADDLE_ENFORCE when `s` was not registered in the pool.
 void ScopePool::Remove(Scope *s) {
  size_t has_scope;
  {
    // Hold the lock only while touching the set; the enforce check and the
    // deletion below run after the lock has been released.
    std::lock_guard<std::mutex> guard(mtx_);
    has_scope = scopes_.erase(s);
  }
  PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope");
  DeleteScope(s);
 }
 // Releases everything still registered in the pool by delegating to Clear().
 ScopePool::~ScopePool() { Clear(); }
 // Deletes every scope registered in the pool and empties the set.
 // The whole operation runs while holding mtx_.
 void ScopePool::Clear() {
  std::lock_guard<std::mutex> guard(mtx_);
  for (auto it = scopes_.begin(); it != scopes_.end(); ++it) {
    DeleteScope(*it);
  }
  scopes_.clear();
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/scope_pool.h
+++ b/paddle/fluid/framework/scope_pool.h
@ -0,0 +1,46 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <mutex>  // NOLINT
 #include <unordered_set>
 #include "paddle/fluid/framework/scope.h"
 namespace paddle {
 namespace framework {
 // A registry of heap-allocated Scope objects stored as raw pointers.
 // Scopes handed to the pool are deleted by Remove(), Clear(), or the pool's
 // destructor. Access to the underlying set is guarded by a mutex.
 class ScopePool {
 public:
  // Returns the single pool instance.
  static ScopePool &Instance();  // NOLINT
  // Moves the scope held by `s` into the pool.
  void Insert(std::unique_ptr<Scope> &&s);
  // Unregisters `s` and deletes it; enforces that `s` was registered.
  void Remove(Scope *s);
  // Deletes all registered scopes and empties the pool.
  void Clear();
  // Calls Clear().
  ~ScopePool();
 private:
  // Private so that instances are only created through Instance().
  ScopePool() = default;
  // Deletes a single scope.
  static void DeleteScope(Scope *scope);
  // Raw pointers to the scopes currently registered in the pool.
  std::unordered_set<Scope *> scopes_;
  // Guards scopes_.
  std::mutex mtx_;
 };
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@ -28,8 +28,7 @@ void Tensor::check_memory_size() const {
      "or maybe the required data-type mismatches the data already stored.");
 }
-Tensor::Tensor(std::type_index type)
+Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
    : type_(framework::ToDataType(type)), offset_(0) {}
 size_t Tensor::memory_size() const {
  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@ -69,7 +69,7 @@ class Tensor {
 public:
  Tensor() : type_(proto::VarType::FP32), offset_(0) {}
-  explicit Tensor(std::type_index type);
+  explicit Tensor(const proto::VarType::Type&);
  /*! Return a pointer to mutable memory block. */
  template <typename T>
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/temporary_allocator.h"
 namespace paddle {
 namespace framework {
@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
               src_ptr, size);
 }
 // Builds a Tensor of element type T and shape `dim` whose storage is the
 // given allocation. The allocation must be a platform::TemporaryAllocation
 // and its size must equal product(dim) * sizeof(T); both are enforced.
 template <typename T>
 paddle::framework::Tensor GetTensor(
    memory::allocation::AllocationPtr temp_allocation_ptr,
    const framework::DDim& dim) {
  // Transfer ownership from the AllocationPtr to a shared_ptr that uses the
  // same deleter.
  auto& original_deleter = temp_allocation_ptr.get_deleter();
  auto* allocation_ptr = temp_allocation_ptr.release();
  std::shared_ptr<memory::allocation::Allocation> holder(allocation_ptr,
                                                         original_deleter);
  // The operands of the enforce macros are kept verbatim.
  PADDLE_ENFORCE(
      dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
      "The AllocationPtr must be TemporaryAllocation.");
  PADDLE_ENFORCE_EQ(allocation_ptr->size(),
                    framework::product(dim) * sizeof(T));
  paddle::framework::Tensor result(
      framework::ToDataType(std::type_index(typeid(T))));
  result.Resize(dim);
  result.ResetHolder(std::move(holder));
  return result;
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@ -231,11 +231,14 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                  inputs[i].data.length());
    } else {
 #ifdef PADDLE_WITH_CUDA
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto *dev_ctx =
          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(),
+                   inputs[i].data.length(), dev_ctx->stream());
                   0);  // stream 0 for sync copy
 #else
      PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@ -208,11 +208,14 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                  inputs[i].data.length());
    } else {
 #ifdef PADDLE_WITH_CUDA
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto *dev_ctx =
          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
-                   inputs[i].data.length(),
+                   inputs[i].data.length(), dev_ctx->stream());
                   0);  // stream 0 for sync copy
 #else
      PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@ -75,6 +75,11 @@ set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
 download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc)
 # MM DNN
 set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn")
 download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc)
 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
 download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
@ -103,6 +108,10 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
  "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
 # seq_pool1
 inference_analysis_api_test_with_fake_data(test_analyzer_seq_pool1
 "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1" analyzer_seq_pool1_tester.cc "seq_pool1.tar.gz")
 # mobilenet with depthwise_conv op
 inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
  "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
--- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
@ -0,0 +1,178 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 namespace paddle {
 namespace inference {
 using contrib::AnalysisConfig;
 // Holds the query/title samples read from a data file and hands them out
 // batch by batch, together with the cumulative offsets (LoD) of each batch.
 struct DataRecord {
  // One inner vector of int64 ids per sample.
  std::vector<std::vector<int64_t>> query_data_all, title_data_all;
  // Cumulative element offsets of the query (lod1) and title (lod2) samples.
  std::vector<size_t> lod1, lod2;
  size_t batch_iter{0};
  size_t batch_size{1};
  // Zero-initialised so that default-constructed records (such as the ones
  // returned by NextBatch) never expose an indeterminate value.
  size_t num_samples{0};  // total number of samples
  DataRecord() = default;
  // Loads every sample found in `path`; `batch_size` is the number of samples
  // returned by each NextBatch() call.
  explicit DataRecord(const std::string &path, int batch_size = 1)
      : batch_size(batch_size) {
    Load(path);
  }
  // Returns the next `batch_size` samples with their LoD offsets and advances
  // the cursor. The returned record is empty when fewer than `batch_size`
  // samples remain.
  DataRecord NextBatch() {
    DataRecord data;
    size_t batch_end = batch_iter + batch_size;
    // NOTE skip the final batch, if no enough data is provided.
    if (batch_end <= query_data_all.size()) {
      data.query_data_all.assign(query_data_all.begin() + batch_iter,
                                 query_data_all.begin() + batch_end);
      data.title_data_all.assign(title_data_all.begin() + batch_iter,
                                 title_data_all.begin() + batch_end);
      // Prepare LoDs
      data.lod1.push_back(0);
      data.lod2.push_back(0);
      CHECK(!data.query_data_all.empty());
      CHECK(!data.title_data_all.empty());
      CHECK_EQ(data.query_data_all.size(), data.title_data_all.size());
      for (size_t j = 0; j < data.query_data_all.size(); j++) {
        // calculate lod
        data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size());
        data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size());
      }
    }
    batch_iter += batch_size;
    return data;
  }
  // Reads `path` line by line. Each line must hold at least two tab-separated
  // fields: space-separated query ids followed by space-separated title ids.
  void Load(const std::string &path) {
    std::ifstream file(path);
    // Fail loudly instead of silently ending up with zero samples.
    CHECK(file.is_open()) << "Failed to open data file: " << path;
    std::string line;
    size_t num_lines = 0;
    while (std::getline(file, line)) {
      num_lines++;
      std::vector<std::string> data;
      split(line, '\t', &data);
      // data[0] and data[1] are read below; reject lines that lack them.
      CHECK(data.size() >= 2UL)
          << "Malformed line " << num_lines << " in " << path
          << ": expected two tab-separated fields";
      // load query data
      std::vector<int64_t> query_data;
      split_to_int64(data[0], ' ', &query_data);
      // load title data
      std::vector<int64_t> title_data;
      split_to_int64(data[1], ' ', &title_data);
      query_data_all.push_back(std::move(query_data));
      title_data_all.push_back(std::move(title_data));
    }
    num_samples = num_lines;
  }
 };
 void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                   int batch_size) {
  PaddleTensor lod_query_tensor, lod_title_tensor;
  lod_query_tensor.name = "left";
  lod_title_tensor.name = "right";
  auto one_batch = data->NextBatch();
  int size1 = one_batch.lod1[one_batch.lod1.size() - 1];  // token batch size
  int size2 = one_batch.lod2[one_batch.lod2.size() - 1];  // token batch size
  lod_query_tensor.shape.assign({size1, 1});
  lod_query_tensor.lod.assign({one_batch.lod1});
  lod_title_tensor.shape.assign({size2, 1});
  lod_title_tensor.lod.assign({one_batch.lod2});
  // assign data
  TensorAssignData<int64_t>(&lod_query_tensor, one_batch.query_data_all);
  TensorAssignData<int64_t>(&lod_title_tensor, one_batch.title_data_all);
  // Set inputs.
  input_slots->assign({lod_query_tensor, lod_title_tensor});
  for (auto &tensor : *input_slots) {
    tensor.dtype = PaddleDType::INT64;
  }
 }
 // Fills `cfg` for this test: model directory taken from FLAGS_infer_model,
 // GPU disabled, input names specified and IR optimisation enabled.
 void SetConfig(contrib::AnalysisConfig *cfg) {
  auto &config = *cfg;
  config.model_dir = FLAGS_infer_model;
  config.specify_input_name = true;
  config.enable_ir_optim = true;
  // GPU is off; the device id is still set to 0.
  config.use_gpu = false;
  config.device = 0;
 }
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  std::vector<PaddleTensor> input_slots;
  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
  for (int bid = 0; bid < epoch; ++bid) {
    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
    (*inputs).emplace_back(input_slots);
  }
 }
 // Stand-alone prediction run, convenient for profiling. The outputs are only
 // inspected when running single-threaded on a single batch.
 TEST(Analyzer_MM_DNN, profile) {
  contrib::AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  const auto *base_cfg =
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg);
  TestPrediction(base_cfg, input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads != 1 || FLAGS_test_all_data) {
    return;
  }
  PADDLE_ENFORCE_EQ(outputs.size(), 2UL);
  for (auto &output : outputs) {
    size_t size = GetSize(output);
    PADDLE_ENFORCE_GT(size, 0);
    float *result = static_cast<float *>(output.data.data());
    // Every output value is expected to lie strictly inside (-1, 1).
    for (size_t i = 0; i < size; i++) {
      EXPECT_GT(result[i], -1);
      EXPECT_LT(result[i], 1);
    }
  }
 }
 // Creates an analysis predictor and queries its fuse statistics. The result
 // is not asserted on here.
 TEST(Analyzer_MM_DNN, fuse_statis) {
  contrib::AnalysisConfig cfg;
  SetConfig(&cfg);
  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
  auto *analysis_predictor =
      static_cast<AnalysisPredictor *>(predictor.get());
  int num_ops;
  auto fuse_statis = GetFuseStatis(analysis_predictor, &num_ops);
 }
 // Feeds the same inputs through CompareNativeAndAnalysis to compare the
 // NativeConfig and AnalysisConfig results.
 TEST(Analyzer_MM_DNN, compare) {
  contrib::AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);

  const auto *base_cfg =
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg);
  CompareNativeAndAnalysis(base_cfg, input_slots_all);
 }
 // Hands the configuration and inputs to CompareDeterministic to check that
 // the results are deterministic.
 TEST(Analyzer_MM_DNN, compare_determine) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);

  const auto *base_cfg =
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg);
  CompareDeterministic(base_cfg, input_slots_all);
 }
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@ -0,0 +1,117 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
 #include <iostream>
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
 // Fills `cfg` for this test: program and parameter files located under
 // FLAGS_infer_model, GPU disabled, IR optimisation enabled, input names
 // specified, and the CPU math library thread count taken from
 // FLAGS_paddle_num_threads.
 void SetConfig(AnalysisConfig *cfg) {
  auto &config = *cfg;
  config.prog_file = FLAGS_infer_model + "/model";
  config.param_file = FLAGS_infer_model + "/params";
  config.specify_input_name = true;
  config.enable_ir_optim = true;
  // GPU is off; the device id is still set to 0.
  config.use_gpu = false;
  config.device = 0;
  config.SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
 }
 // Fills `inputs` with fake data for the seq_pool1 model by delegating to
 // SetFakeImageInput, passing the model directory, the "model"/"params" file
 // names and the explicit list of feed names below.
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  // Names handed to SetFakeImageInput as its feed-name list.
  // NOTE(review): presumably these must match the model's feed targets in
  // number and order -- confirm against SetFakeImageInput.
  std::vector<std::string> feed_names = {
      "slot10000_embed", "slot10001_embed", "slot10004_embed",
      "slot10005_embed", "slot10008_embed", "slot10009_embed",
      "slot10012_embed", "slot10013_embed", "slot10108_embed",
      "slot13324_embed", "slot13325_embed", "slot13326_embed",
      "slot13327_embed", "slot13328_embed", "slot13329_embed",
      "slot13330_embed", "slot13331_embed", "slot15501_embed",
      "slot15502_embed", "slot15503_embed", "slot15504_embed",
      "slot15505_embed", "slot15506_embed", "slot15507_embed",
      "slot15508_embed", "slot15516_embed", "slot15519_embed",
      "slot15523_embed", "slot15531_embed", "slot15533_embed",
      "slot15548_embed", "slot15564_embed", "slot15565_embed",
      "slot15566_embed", "slot15570_embed", "slot15571_embed",
      "slot15572_embed", "slot15573_embed", "slot15574_embed",
      "slot15575_embed", "slot15576_embed", "slot15577_embed",
      "slot15579_embed", "slot15581_embed", "slot15582_embed",
      "slot15583_embed", "slot15584_embed", "slot5016_embed",
      "slot5021_embed",  "slot6002_embed",  "slot6003_embed",
      "slot6004_embed",  "slot6005_embed",  "slot6006_embed",
      "slot6007_embed",  "slot6008_embed",  "slot6009_embed",
      "slot6011_embed",  "slot6014_embed",  "slot6015_embed",
      "slot6023_embed",  "slot6024_embed",  "slot6025_embed",
      "slot6027_embed",  "slot6029_embed",  "slot6031_embed",
      "slot6034_embed",  "slot6035_embed",  "slot6036_embed",
      "slot6037_embed",  "slot6039_embed",  "slot6048_embed",
      "slot6050_embed",  "slot6058_embed",  "slot6059_embed",
      "slot6060_embed",  "slot6066_embed",  "slot6067_embed",
      "slot6068_embed",  "slot6069_embed",  "slot6070_embed",
      "slot6071_embed",  "slot6072_embed",  "slot6073_embed",
      "slot6182_embed",  "slot6183_embed",  "slot6184_embed",
      "slot6185_embed",  "slot6186_embed",  "slot6188_embed",
      "slot6189_embed",  "slot6190_embed",  "slot6201_embed",
      "slot6202_embed",  "slot6203_embed",  "slot6247_embed",
      "slot6248_embed",  "slot6250_embed",  "slot6251_embed",
      "slot6807_embed",  "slot6808_embed",  "slot6809_embed",
      "slot6810_embed",  "slot6811_embed",  "slot6812_embed",
      "slot6813_embed",  "slot6814_embed",  "slot6815_embed",
      "slot6816_embed",  "slot6817_embed",  "slot6818_embed",
      "slot6819_embed",  "slot6820_embed",  "slot6822_embed",
      "slot6823_embed",  "slot6826_embed",  "slot7002_embed",
      "slot7003_embed",  "slot7004_embed",  "slot7005_embed",
      "slot7006_embed",  "slot7008_embed",  "slot7009_embed",
      "slot7010_embed",  "slot7011_embed",  "slot7013_embed",
      "slot7014_embed",  "slot7015_embed",  "slot7016_embed",
      "slot7017_embed",  "slot7019_embed",  "slot7100_embed",
      "slot7506_embed",  "slot7507_embed",  "slot7514_embed",
      "slot7515_embed",  "slot7516_embed"};
  // The third argument (true) is passed as the helper's `is_combined` flag.
  SetFakeImageInput(inputs, FLAGS_infer_model, true, "model", "params",
                    &feed_names);
 }
 // Easy for profiling independently.
 void profile(bool use_mkldnn = false) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  if (use_mkldnn) {
    cfg.EnableMKLDNN();
  }
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                 input_slots_all, &outputs, FLAGS_num_threads);
 }
 // Profiles the model without calling EnableMKLDNN() on the configuration.
 TEST(Analyzer_seq_pool1, profile) {
  profile(/*use_mkldnn=*/false);
 }
 // Creates an analysis predictor, queries its fuse statistics and checks the
 // reported operator count.
 TEST(Analyzer_seq_pool1, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
  auto *analysis_predictor =
      static_cast<AnalysisPredictor *>(predictor.get());
  int num_ops;
  auto fuse_statis = GetFuseStatis(analysis_predictor, &num_ops);
  LOG(INFO) << "num_ops: " << num_ops;
  EXPECT_EQ(num_ops, 314);
 }
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@ -132,7 +132,8 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
 void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
                       const std::string &dirname, bool is_combined = true,
                       std::string model_filename = "model",
-                       std::string params_filename = "params") {
+                       std::string params_filename = "params",
                       const std::vector<std::string> *feed_names = nullptr) {
  // Set fake_image_data
  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
  std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
@ -146,26 +147,32 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
    os << "}\n";
  }
  LOG(INFO) << os.str();
-
+  if (feed_names) {
-  int dim1 = feed_target_shapes[0][1];
+    PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size());
-  int dim2 = feed_target_shapes[0][2];
+  }
-  int dim3 = feed_target_shapes[0][3];
+  std::vector<PaddleTensor> input_slots(feed_target_shapes.size());
-
+  for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
-  PaddleTensor input;
+    const auto &feed_shape = feed_target_shapes[i];
-  std::vector<int> shape({FLAGS_batch_size, dim1, dim2, dim3});
+    auto &input = input_slots[i];
    std::vector<int> shape({FLAGS_batch_size});
    for (size_t s = 1; s < feed_shape.size(); ++s) {
      shape.push_back(static_cast<int>(feed_shape[s]));
    }
    if (feed_names) {
      input.name = (*feed_names)[i];
    }
    input.shape = shape;
    input.dtype = PaddleDType::FLOAT32;
-
+    size_t len = std::accumulate(shape.begin(), shape.end(), 1,
-  // fill input data, for profile easily, do not use random data here.
+                                 [](int a, int b) { return a * b; });
-  size_t size = FLAGS_batch_size * dim1 * dim2 * dim3;
+    input.data.Resize(len * sizeof(float));
-  input.data.Resize(size * sizeof(float));
+    input.lod.assign({{0, static_cast<size_t>(FLAGS_batch_size)}});
    float *input_data = static_cast<float *>(input.data.data());
-  for (size_t i = 0; i < size; i++) {
+    // fill input data, for profile easily, do not use random data here.
-    *(input_data + i) = static_cast<float>(i) / size;
+    for (size_t j = 0; j < len; ++j) {
      *(input_data + j) = static_cast<float>(j) / len;
    }
  }
  std::vector<PaddleTensor> input_slots;
  input_slots.assign({input});
  (*inputs).emplace_back(input_slots);
 }
--- a/Show More
+++ b/Show More