Support dynamic shape on host CPU

pull/8846/head
liubuyu 4 years ago
parent 42cbdfcafc
commit 9f5ab8f76f

@@ -289,14 +289,14 @@ bool CreateNodeDefBytes(const std::shared_ptr<AnfNode> &anf_node,
return true;
}
uint64_t SetExtInfoShapeType(char *ext_info_buf, uint64_t ext_info_offset) {
uint64_t SetExtInfoShapeType(char *ext_info_buf, uint64_t ext_info_offset, UnknowShapeOpType type) {
// deal1: unknown shape type
auto *info = reinterpret_cast<ExtInfo *>(ext_info_buf + ext_info_offset);
info->infoType = FWK_ADPT_EXT_SHAPE_TYPE;
info->infoLen = sizeof(int32_t);
ext_info_offset += kExtInfoHeadSize;
auto *shape_type = reinterpret_cast<int32_t *>(ext_info_buf + ext_info_offset);
*shape_type = UnknowShapeOpType::DEPEND_COMPUTE;
*shape_type = type;
ext_info_offset += info->infoLen;
return ext_info_offset;
}
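Each ext-info record written above is a type-length-value entry: a fixed header carrying infoType and infoLen, followed by infoLen bytes of payload, with ext_info_offset advanced past both. A minimal sketch of a reader for that layout, assuming kExtInfoHeadSize == sizeof(ExtInfo) and 32-bit header fields (both assumptions, not taken from the real aicpu headers):

    #include <cstdint>
    #include <cstring>

    struct ExtInfoSketch {
      int32_t infoType;  // e.g. FWK_ADPT_EXT_SHAPE_TYPE (width assumed)
      uint32_t infoLen;  // payload size in bytes (width assumed)
    };
    constexpr uint64_t kExtInfoHeadSizeSketch = sizeof(ExtInfoSketch);

    // Scan a packed ext-info buffer; return the payload offset of the first
    // record of the requested type, or -1 if none is present.
    int64_t FindExtInfoPayload(const char *buf, uint64_t len, int32_t type) {
      uint64_t offset = 0;
      while (offset + kExtInfoHeadSizeSketch <= len) {
        ExtInfoSketch head;
        std::memcpy(&head, buf + offset, kExtInfoHeadSizeSketch);
        offset += kExtInfoHeadSizeSketch;
        if (head.infoType == type) return static_cast<int64_t>(offset);
        offset += head.infoLen;
      }
      return -1;
    }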
@@ -401,7 +401,11 @@ bool CreateExtInfo(const std::shared_ptr<AnfNode> &anf_node, const std::shared_p
ext_info.resize(ext_info_len, 0);
char *ext_info_buf = ext_info.data();
ext_info_offset = SetExtInfoShapeType(ext_info_buf, ext_info_offset);
UnknowShapeOpType shape_type = UnknowShapeOpType::DEPEND_IN_SHAPE;
if (AnfAlgo::GetCNodeName(anf_node) == "Unique") {
shape_type = UnknowShapeOpType::DEPEND_COMPUTE;
}
ext_info_offset = SetExtInfoShapeType(ext_info_buf, ext_info_offset, shape_type);
ext_info_offset = SetExtInfoInputShapeType(ext_info_buf, ext_info_offset, anf_node, input_num);
ext_info_offset = SetExtInfoOutputShapeType(ext_info_buf, ext_info_offset, anf_node, output_num);
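Two unknown-shape categories are distinguished here, and again in AiCpuDynamicKernel::Initialize later in this commit: DEPEND_IN_SHAPE for ops whose output shape can be derived from the input shapes at launch, and DEPEND_COMPUTE for ops such as Unique whose output shape is only known after the kernel has run. The selection rule as a self-contained sketch (enumerator values are assumed for illustration):

    #include <string>

    enum class UnknowShapeOpTypeSketch { DEPEND_IN_SHAPE, DEPEND_COMPUTE };

    // Unique's output length depends on the data, so its shape can only be
    // read back after execution; everything else here infers from inputs.
    UnknowShapeOpTypeSketch GetUnknownShapeType(const std::string &op_name) {
      return op_name == "Unique" ? UnknowShapeOpTypeSketch::DEPEND_COMPUTE
                                 : UnknowShapeOpTypeSketch::DEPEND_IN_SHAPE;
    }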

@@ -18,6 +18,7 @@
#include <algorithm>
#include <map>
#include <set>
#include <stack>
#include "ir/anf.h"
#include "ir/func_graph.h"
#include "base/core_ops.h"
@@ -30,6 +31,7 @@
#include "backend/kernel_compiler/kernel_build_info.h"
#include "common/trans.h"
#include "abstract/param_validator.h"
#include "abstract/primitive_infer_map.h"
#include "pipeline/jit/static_analysis/static_analysis.h"
#include "utils/trace_base.h"
@@ -820,6 +822,8 @@ DeviceAddressPtr AnfRuntimeAlgorithm::GetMutableWorkspaceAddr(const AnfNodePtr &
void AnfRuntimeAlgorithm::SetOutputInferTypeAndShape(const std::vector<TypeId> &types,
const std::vector<std::vector<size_t>> &shapes, AnfNode *node) {
MS_EXCEPTION_IF_NULL(node);
auto node_ptr = node->cast<AnfNodePtr>();
MS_EXCEPTION_IF_NULL(node_ptr);
if (types.size() != shapes.size()) {
MS_LOG(EXCEPTION) << "Types size " << types.size() << "should be same with shapes size " << shapes.size()
<< " trace: " << trace::DumpSourceLines(node);
@@ -829,16 +833,23 @@ void AnfRuntimeAlgorithm::SetOutputInferTypeAndShape(const std::vector<TypeId> &
} else if (shapes.size() == 1) {
// single output handle
ShapeVector shape_int;
auto max_shape = GetOutputMaxShape(node_ptr, 0);
auto min_shape = GetOutputMinShape(node_ptr, 0);
std::transform(shapes[0].begin(), shapes[0].end(), std::back_inserter(shape_int), SizeToLong);
auto abstract = std::make_shared<AbstractTensor>(TypeIdToType(types[0]), shape_int);
auto abstract = std::make_shared<AbstractTensor>(
TypeIdToType(types[0]), std::make_shared<abstract::Shape>(shape_int, min_shape, max_shape));
node->set_abstract(abstract);
} else {
// multiple output handle
std::vector<AbstractBasePtr> abstract_list;
for (size_t i = 0; i < types.size(); ++i) {
ShapeVector shape_int;
auto max_shape = GetOutputMaxShape(node_ptr, i);
auto min_shape = GetOutputMinShape(node_ptr, i);
std::transform(shapes[i].begin(), shapes[i].end(), std::back_inserter(shape_int), SizeToLong);
abstract_list.emplace_back(std::make_shared<AbstractTensor>(TypeIdToType(types[i]), shape_int));
auto abstract = std::make_shared<AbstractTensor>(
TypeIdToType(types[i]), std::make_shared<abstract::Shape>(shape_int, min_shape, max_shape));
abstract_list.emplace_back(abstract);
}
auto abstract_tuple = std::make_shared<AbstractTuple>(abstract_list);
node->set_abstract(abstract_tuple);
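Both branches now attach an abstract::Shape that carries min/max bounds next to the (possibly dynamic) inferred shape, instead of a bare ShapeVector. In miniature, assuming the usual convention that -1 marks a dynamic dimension (the type below is a simplified stand-in, not MindSpore's):

    #include <cstdint>
    #include <vector>

    struct BoundedShapeSketch {
      std::vector<int64_t> shape;      // -1 marks a dynamic dimension
      std::vector<int64_t> min_shape;  // per-dimension lower bound
      std::vector<int64_t> max_shape;  // per-dimension upper bound
    };

    // Unique over a length-n input: the output length is data dependent,
    // so the inferred shape is {-1}, bounded here by [1, n].
    BoundedShapeSketch UniqueOutputShape(int64_t n) {
      return BoundedShapeSketch{{-1}, {1}, {n}};
    }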
@@ -1409,7 +1420,7 @@ std::vector<int64_t> AnfRuntimeAlgorithm::GetOutputMinShape(const AnfNodePtr &an
}
}
bool AnfRuntimeAlgorithm::IsNodeDynamicShape(const AnfNodePtr &node) {
bool IsNodeOutputDynamicShape(const CNodePtr &node) {
MS_EXCEPTION_IF_NULL(node);
auto base_shape = node->Shape();
if (base_shape == nullptr) {
@@ -1436,6 +1447,66 @@ bool AnfRuntimeAlgorithm::IsNodeDynamicShape(const AnfNodePtr &node) {
return false;
}
bool IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr) {
MS_EXCEPTION_IF_NULL(anf_node_ptr);
auto input_num = AnfAlgo::GetInputTensorNum(anf_node_ptr);
for (size_t i = 0; i < input_num; ++i) {
auto input_with_index = AnfAlgo::GetPrevNodeOutput(anf_node_ptr, i);
auto input = input_with_index.first;
auto index = input_with_index.second;
MS_EXCEPTION_IF_NULL(input);
auto base_shape = input->Shape();
if (base_shape == nullptr) {
MS_LOG(INFO) << "Invalid shape ptr, node:" << input->fullname_with_scope();
continue;
}
if (base_shape->isa<abstract::Shape>()) {
if (IsShapeDynamic(base_shape->cast<abstract::ShapePtr>())) {
return true;
}
} else if (base_shape->isa<abstract::TupleShape>()) {
auto tuple_shape = base_shape->cast<abstract::TupleShapePtr>();
MS_EXCEPTION_IF_NULL(tuple_shape);
if (index >= tuple_shape->size()) {
MS_LOG(INFO) << "Node:" << anf_node_ptr->fullname_with_scope() << "Invalid index:" << index
<< " and tuple_shape size:" << tuple_shape->size();
continue;
}
auto b_shp = (*tuple_shape)[index];
if (!b_shp->isa<abstract::Shape>()) {
continue;
}
if (IsShapeDynamic(b_shp->cast<abstract::ShapePtr>())) {
return true;
}
}
}
return false;
}
bool AnfRuntimeAlgorithm::IsNodeDynamicShape(const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL(node);
if (!node->isa<CNode>()) {
MS_LOG(WARNING) << "Node is not a cnode";
return false;
}
auto cnode = node->cast<CNodePtr>();
auto in_dynamic = IsNodeInputDynamicShape(cnode);
auto out_dynamic = IsNodeOutputDynamicShape(cnode);
if (in_dynamic && !AnfAlgo::HasNodeAttr(kAttrInputIsDynamicShape, cnode)) {
AnfAlgo::SetNodeAttr(kAttrInputIsDynamicShape, MakeValue(true), cnode);
MS_LOG(INFO) << "Set Input Dynamic Shape Attr to Node:" << cnode->fullname_with_scope();
}
if (out_dynamic && !AnfAlgo::HasNodeAttr(kAttrOutputIsDynamicShape, cnode)) {
AnfAlgo::SetNodeAttr(kAttrOutputIsDynamicShape, MakeValue(true), cnode);
MS_LOG(INFO) << "Set Output Dynamic Shape Attr to Node:" << cnode->fullname_with_scope();
}
return in_dynamic || out_dynamic;
}
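IsNodeDynamicShape now doubles as the single marking point: it caches what it finds as per-node attributes, which UpdateGraphDynamicShapeAttr (further down in this commit) folds into a combined flag. The attribute contract, reduced to plain bools as a stand-in:

    // Stand-in for the three node attributes involved; kAttrIsDynamicShape
    // is set exactly when either side is dynamic.
    struct DynShapeFlagsSketch {
      bool input_is_dynamic = false;   // kAttrInputIsDynamicShape
      bool output_is_dynamic = false;  // kAttrOutputIsDynamicShape
      bool is_dynamic() const { return input_is_dynamic || output_is_dynamic; }
    };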
std::vector<size_t> AnfRuntimeAlgorithm::GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index) {
auto device_shape = GetInputDeviceShape(anf_node, index);
// Initialize GPUKernel with max shape to fit 'InitDynamicOutputKernelRef()' for memory reuse.
@@ -1500,5 +1571,50 @@ void AnfRuntimeAlgorithm::GetAllFatherRealNode(const AnfNodePtr &anf_node, std::
GetAllFatherRealNode(cnode->input(kDependAttachNodeIndex), result, visited);
}
}
void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node) {
MS_EXCEPTION_IF_NULL(node);
MS_LOG(INFO) << "InferShape start, node:" << node->DebugString();
auto inputs = node->inputs();
if (inputs.empty()) {
MS_LOG(EXCEPTION) << "Invalid inputs";
}
AbstractBasePtrList args_spec_list;
auto primitive = GetValueNode<PrimitivePtr>(inputs[0]);
auto input_size = AnfAlgo::GetInputTensorNum(node);
for (size_t i = 0; i < input_size; ++i) {
auto input_with_index = AnfAlgo::GetPrevNodeOutput(node, i);
auto real_input = input_with_index.first;
MS_EXCEPTION_IF_NULL(real_input);
auto cnode_input = node->input(i + 1);
MS_EXCEPTION_IF_NULL(cnode_input);
if (AnfAlgo::CheckPrimitiveType(cnode_input, prim::kPrimTupleGetItem)) {
auto base_shape = real_input->Shape();
if (!base_shape->isa<abstract::TupleShape>()) {
MS_LOG(EXCEPTION) << "Node:" << node->DebugString()
<< " input is a tuple_get_item but real input node shape is not a TupleShape";
}
auto tuple_ptr = base_shape->cast<abstract::TupleShapePtr>();
MS_EXCEPTION_IF_NULL(tuple_ptr);
auto tuple_get_item_index = AnfAlgo::GetTupleGetItemOutIndex(cnode_input->cast<CNodePtr>());
auto real_shape = tuple_ptr->shape().at(tuple_get_item_index);
auto abstract_tensor = cnode_input->abstract()->cast<abstract::AbstractTensorPtr>();
MS_EXCEPTION_IF_NULL(abstract_tensor);
args_spec_list.emplace_back(std::make_shared<abstract::AbstractTensor>(abstract_tensor->element(), real_shape));
} else if (cnode_input->isa<CNode>() && AnfAlgo::GetCNodeName(cnode_input) == prim::kPrimReshape->name()) {
args_spec_list.emplace_back(cnode_input->abstract());
} else {
args_spec_list.emplace_back(real_input->abstract());
}
}
auto &prim_eval_implement_map = abstract::GetPrimitiveToEvalImplMap();
auto ret = prim_eval_implement_map.find(primitive);
if (ret == prim_eval_implement_map.end()) {
MS_LOG(EXCEPTION) << "Get infer shape function failed, primitive name:" << primitive->name()
<< " primitive type:" << primitive->type_name();
}
auto eval_result = ret->second.impl_(nullptr, primitive, args_spec_list);
node->set_abstract(eval_result);
}
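AnfAlgo::InferShape re-runs the primitive's registered inference implementation at execution time, once the real input shapes are known, and installs the result as the node's new abstract; the CPU runtime (see CPUKernelRuntime::Run below) calls it before launching each dynamic-shape kernel. The lookup-and-dispatch pattern, as a self-contained sketch (the registry below is a simplified stand-in for GetPrimitiveToEvalImplMap):

    #include <cstdint>
    #include <functional>
    #include <map>
    #include <stdexcept>
    #include <string>
    #include <vector>

    using ShapeV = std::vector<int64_t>;
    using InferFn = std::function<ShapeV(const std::vector<ShapeV> &)>;

    std::map<std::string, InferFn> &InferRegistry() {
      static std::map<std::string, InferFn> registry;
      return registry;
    }

    // Look up the primitive's infer function and re-run it with the shapes
    // observed at execution time; mirrors the error path in the code above.
    ShapeV RunInfer(const std::string &prim, const std::vector<ShapeV> &inputs) {
      auto it = InferRegistry().find(prim);
      if (it == InferRegistry().end()) {
        throw std::runtime_error("Get infer shape function failed, primitive name:" + prim);
      }
      return it->second(inputs);
    }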
} // namespace session
} // namespace mindspore

@@ -230,6 +230,7 @@ class AnfRuntimeAlgorithm {
static std::vector<int64_t> GetOutputMaxShape(const AnfNodePtr &anf_node, size_t index);
static std::vector<int64_t> GetOutputMinShape(const AnfNodePtr &anf_node, size_t index);
static bool IsNodeDynamicShape(const AnfNodePtr &node);
static void InferShape(const CNodePtr &node);
static std::vector<size_t> GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index);
static std::vector<size_t> GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index);
// Find control_depend real input nodes.

@@ -65,6 +65,8 @@ GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
auto graph_id = graph_sum_;
auto graph = ConstructKernelGraph(lst, outputs);
MS_EXCEPTION_IF_NULL(graph);
UpdateGraphDynamicShapeAttr(NOT_NULL(graph));
graph->UpdateGraphDynamicAttr();
MS_LOG(INFO) << "Set kernel info";
SetKernelInfo(graph.get());
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
@@ -87,7 +89,7 @@ void CPUSession::CreateOutputTensors(const GraphId &graph_id, const std::vector<
std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) {
auto kernel_graph = GetGraph(graph_id);
MS_EXCEPTION_IF_NULL(kernel_graph);
runtime_.CreateOutputTensors(kernel_graph.get(), input_tensors, outputs);
runtime_.CreateOutputTensors(kernel_graph.get(), input_tensors, outputs, tensor_to_node);
}
void CPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,

@@ -47,6 +47,41 @@ static std::shared_ptr<std::map<ValuePtr, ParameterPtr>> python_paras;
void ClearPythonParasMap() { python_paras = nullptr; }
namespace {
const int kSummaryGetItem = 2;
bool IsUsedByRealKernel(const FuncGraphManagerPtr &manager, const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL(manager);
MS_EXCEPTION_IF_NULL(node);
auto node_users = manager->node_users()[node];
for (auto item : node_users) {
if (AnfAlgo::IsRealKernel(item.first)) {
return true;
}
}
return false;
}
bool IsUsedByDynamicKernel(const FuncGraphManagerPtr &manager, const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL(manager);
MS_EXCEPTION_IF_NULL(node);
auto node_users = manager->node_users()[node];
for (auto item : node_users) {
if (item.first->isa<CNode>() && AnfAlgo::IsNodeDynamicShape(item.first->cast<CNodePtr>())) {
return true;
}
}
return false;
}
bool CheckIfNeedCreateOutputTensor(const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL(node);
if (node->isa<Parameter>()) {
auto node_ptr = node->cast<ParameterPtr>();
MS_EXCEPTION_IF_NULL(node_ptr);
if (!node_ptr->is_used_by_real_kernel()) {
return true;
}
}
return false;
}
ValuePtr GetParamDefaultValue(const AnfNodePtr &node) {
if (node == nullptr) {
@@ -114,6 +149,8 @@ BaseRef CreateNodeOutputTensor(const session::KernelWithIndex &node_output_pair,
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(tensor_to_node);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
MS_LOG(INFO) << "Create tensor for output[" << node->DebugString() << "] index[" << node_output_pair.second << "]";
// if node is a value node, no need sync addr from device to host
if (node->isa<ValueNode>()) {
@@ -121,7 +158,8 @@ BaseRef CreateNodeOutputTensor(const session::KernelWithIndex &node_output_pair,
MS_EXCEPTION_IF_NULL(value_node);
return value_node->value();
}
if (!AnfAlgo::OutputAddrExist(node, output_index)) {
if (!AnfAlgo::OutputAddrExist(node, output_index) ||
(CheckIfNeedCreateOutputTensor(node) && ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode)) {
if (node->isa<Parameter>()) {
for (size_t input_idx = 0; input_idx < graph->inputs().size(); input_idx++) {
if (input_idx >= input_tensors.size()) {
@@ -875,9 +913,21 @@ KernelGraphPtr SessionBasic::ConstructKernelGraph(const AnfNodePtrList &lst, con
// Update Graph Dynamic Shape Attr
UpdateGraphDynamicShapeAttr(NOT_NULL(graph));
opt::BackendCommonOptimization(graph);
graph->SetInputNodes();
auto input_nodes = graph->input_nodes();
for (auto input_node : input_nodes) {
if (input_node->isa<Parameter>()) {
auto node_ptr = input_node->cast<ParameterPtr>();
MS_EXCEPTION_IF_NULL(node_ptr);
if (!IsUsedByRealKernel(manager, input_node)) {
node_ptr->set_used_by_real_kernel();
}
if (IsUsedByDynamicKernel(manager, input_node)) {
node_ptr->set_used_by_dynamic_kernel();
}
}
}
graph->SetOptimizerFlag();
return graph;
}
@@ -950,7 +1000,22 @@ std::shared_ptr<KernelGraph> SessionBasic::ConstructKernelGraph(const FuncGraphP
MS_LOG_EXCEPTION << "construct func graph " << func_graph->ToString() << "fail!";
}
}
AddParameterToGraphInputs(func_graph->parameters(), graph.get());
FuncGraphManagerPtr manager = MakeManager({graph});
auto input_nodes = graph->inputs();
for (auto input_node : input_nodes) {
if (input_node->isa<Parameter>()) {
auto node_ptr = input_node->cast<ParameterPtr>();
MS_EXCEPTION_IF_NULL(node_ptr);
if (!IsUsedByRealKernel(manager, input_node)) {
node_ptr->set_used_by_real_kernel();
}
if (IsUsedByDynamicKernel(manager, input_node)) {
node_ptr->set_used_by_dynamic_kernel();
}
}
}
graph->SetExecOrderByDefault();
if (ExistSummaryNode(graph.get())) {
graph->set_summary_node_exist(true);
@@ -1021,14 +1086,23 @@ void SessionBasic::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_grap
MS_EXCEPTION_IF_NULL(tensor);
auto input_node = input_nodes[i];
MS_EXCEPTION_IF_NULL(input_node);
auto size = LongToSize(tensor->data().nbytes());
if (input_node->isa<Parameter>() && input_node->cast<ParameterPtr>()->is_used_by_dynamic_kernel()) {
auto tensor_shape = tensor->shape();
std::vector<size_t> shape_tmp;
(void)std::transform(tensor_shape.begin(), tensor_shape.end(), std::back_inserter(shape_tmp), IntToSize);
AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(input_node, 0)}, {shape_tmp},
input_node.get());
size = trans::ShapeSize(shape_tmp) * trans::TypeIdSize(tensor->data_type());
}
if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0) && TensorNeedSync(input_node, tensor)) {
auto device_address = AnfAlgo::GetMutableOutputAddr(input_node, 0);
MS_EXCEPTION_IF_NULL(device_address);
if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(input_node, 0),
LongToSize(tensor->data().nbytes()), tensor->data_type(),
tensor->data_c())) {
if (size != 0 && !device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(input_node, 0), size,
tensor->data_type(), tensor->data_c())) {
MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
}
if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode ||
AnfAlgo::IsParameterWeight(input_node->cast<ParameterPtr>())) {
tensor->set_device_address(device_address);
@@ -1543,55 +1617,6 @@ void SessionBasic::RunGraphAsync(const GraphId &graph_id, const std::vector<tens
executor_->RunGraphAsync(shared_from_this(), graph_id, inputs, outputs);
}
bool IsDynamicShape(const NotNull<abstract::ShapePtr> &shape) {
return std::any_of(shape->shape().begin(), shape->shape().end(), [](int64_t s) { return s < 0; });
}
bool IsNodeOutputDynamicShape(const CNodePtr &anf_node_ptr) {
MS_EXCEPTION_IF_NULL(anf_node_ptr);
return AnfAlgo::IsNodeDynamicShape(anf_node_ptr);
}
bool IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr) {
MS_EXCEPTION_IF_NULL(anf_node_ptr);
auto input_num = AnfAlgo::GetInputTensorNum(anf_node_ptr);
for (size_t i = 0; i < input_num; ++i) {
auto input_with_index = AnfAlgo::GetPrevNodeOutput(anf_node_ptr, i);
auto input = input_with_index.first;
auto index = input_with_index.second;
MS_EXCEPTION_IF_NULL(input);
auto base_shape = input->Shape();
if (base_shape == nullptr) {
MS_LOG(INFO) << "Invalid shape ptr, node:" << input->fullname_with_scope();
continue;
}
if (base_shape->isa<abstract::Shape>()) {
if (IsDynamicShape(NOT_NULL(base_shape->cast<abstract::ShapePtr>()))) {
return true;
}
} else if (base_shape->isa<abstract::TupleShape>()) {
auto tuple_shape = base_shape->cast<abstract::TupleShapePtr>();
MS_EXCEPTION_IF_NULL(tuple_shape);
if (index >= tuple_shape->size()) {
MS_LOG(INFO) << "Node:" << anf_node_ptr->fullname_with_scope() << "Invalid index:" << index
<< " and tuple_shape size:" << tuple_shape->size();
continue;
}
auto b_shp = (*tuple_shape)[index];
if (!b_shp->isa<abstract::Shape>()) {
continue;
}
if (IsDynamicShape(NOT_NULL(b_shp->cast<abstract::ShapePtr>()))) {
return true;
}
}
}
return false;
}
void SessionBasic::UpdateAllGraphDynamicShapeAttr(const std::vector<KernelGraphPtr> &all_graphs) {
bool is_dynamic = false;
for (const auto &graph : all_graphs) {
@@ -1605,20 +1630,10 @@ void SessionBasic::UpdateAllGraphDynamicShapeAttr(const std::vector<KernelGraphP
void SessionBasic::UpdateGraphDynamicShapeAttr(const NotNull<KernelGraphPtr> &root_graph) {
for (const auto &cnode : root_graph->execution_order()) {
auto output_dynamic = IsNodeOutputDynamicShape(NOT_NULL(cnode));
auto input_dynamic = IsNodeInputDynamicShape(NOT_NULL(cnode));
if (output_dynamic || input_dynamic) {
if (AnfAlgo::IsNodeDynamicShape(cnode)) {
AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), cnode);
MS_LOG(INFO) << "Set Dynamic Shape Attr to Node:" << cnode->fullname_with_scope();
}
if (output_dynamic) {
AnfAlgo::SetNodeAttr(kAttrOutputIsDynamicShape, MakeValue(true), cnode);
MS_LOG(INFO) << "Set Output Dynamic Shape Attr to Node:" << cnode->fullname_with_scope();
}
if (input_dynamic) {
AnfAlgo::SetNodeAttr(kAttrInputIsDynamicShape, MakeValue(true), cnode);
MS_LOG(INFO) << "Set Input Dynamic Shape Attr to Node:" << cnode->fullname_with_scope();
}
}
root_graph->UpdateGraphDynamicAttr();
}

@@ -532,7 +532,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size
}
if (format_ == kOpFormat_NCHW || format_ == kOpFormat_DEFAULT || format_ == kOpFormat_NDHWC) {
if (type_id_ == type) {
SyncMemory(ptr_, host_ptr, size_, RT_MEMCPY_HOST_TO_DEVICE);
SyncMemory(ptr_, host_ptr, size, RT_MEMCPY_HOST_TO_DEVICE);
sync_ok = true;
} else if (type_id_ == kNumberTypeFloat32 && type == kNumberTypeFloat64) {
sync_ok = Float64ToFloatAndSyncHostToDevice(ptr_, size_, host_ptr, size);

@@ -66,11 +66,15 @@ void AiCpuDynamicKernel::Initialize() {
input_num_ = AnfAlgo::GetInputTensorNum(cnode_ptr_);
output_num_ = AnfAlgo::GetOutputTensorNum(cnode_ptr_);
UnknowShapeOpType shape_type = UnknowShapeOpType::DEPEND_IN_SHAPE;
if (AnfAlgo::GetCNodeName(cnode_ptr_) == "Unique") {
shape_type = UnknowShapeOpType::DEPEND_COMPUTE;
}
// Parse aicpu ext info
if (is_dynamic_shape_) {
MS_EXCEPTION_IF_NULL(cnode_ptr_);
ext_info_handler_ =
std::make_shared<AicpuExtInfoHandler>(cnode_ptr_->fullname_with_scope(), input_num_, output_num_, DEPEND_COMPUTE);
std::make_shared<AicpuExtInfoHandler>(cnode_ptr_->fullname_with_scope(), input_num_, output_num_, shape_type);
ext_info_handler_->Parse(ext_info_data_);
}

@@ -19,6 +19,7 @@
#include <memory>
#include <numeric>
#include <utility>
#include <algorithm>
#include <functional>
#include "backend/kernel_compiler/kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
@@ -129,9 +130,11 @@ DeviceAddressPtr CPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t
return std::make_shared<CPUDeviceAddress>(device_ptr, device_size, format, type_id);
}
tensor::TensorPtr CPUKernelRuntime::CreatTensorForOutput(session::KernelGraph *kernel_graph, const CNodePtr &node,
size_t index) {
tensor::TensorPtr CPUKernelRuntime::CreatTensorForOutput(
session::KernelGraph *kernel_graph, const CNodePtr &node, size_t index,
std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) {
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(tensor_to_node);
size_t output_size = AnfAlgo::GetOutputTensorNum(node);
if (index >= output_size) {
MS_LOG(EXCEPTION) << "Invalid input index " << index;
@@ -166,13 +169,16 @@ tensor::TensorPtr CPUKernelRuntime::CreatTensorForOutput(session::KernelGraph *k
}
(void)bound_addresses_.insert(address);
}
session::KernelWithIndex node_index(node, index);
tensor->SetNeedWait(true);
tensor->SetIsGraphOutput();
(*tensor_to_node)[tensor] = node_index;
return tensor;
}
BaseRef CPUKernelRuntime::CreatTensorForOutput(session::KernelGraph *kernel_graph,
const session::KernelWithIndex &kernel_with_index) {
const session::KernelWithIndex &kernel_with_index,
std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) {
auto &input_node = kernel_with_index.first;
auto index = kernel_with_index.second;
MS_EXCEPTION_IF_NULL(input_node);
@@ -183,12 +189,12 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(session::KernelGraph *kernel_grap
VectorRef ret;
for (size_t i = 1; i < node->inputs().size(); i++) {
auto item_with_index = AnfAlgo::VisitKernelWithReturnType(node->input(i), 0);
auto out = CreatTensorForOutput(kernel_graph, item_with_index);
auto out = CreatTensorForOutput(kernel_graph, item_with_index, tensor_to_node);
ret.push_back(out);
}
return ret;
}
return CreatTensorForOutput(kernel_graph, node, index);
return CreatTensorForOutput(kernel_graph, node, index, tensor_to_node);
} else if (input_node->isa<Parameter>()) {
auto iter = input_param_tensor_map_.find(input_node);
if (iter != input_param_tensor_map_.end()) {
@@ -203,9 +209,11 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(session::KernelGraph *kernel_grap
}
void CPUKernelRuntime::CreateOutputTensors(session::KernelGraph *kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs,
std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) {
MS_EXCEPTION_IF_NULL(kernel_graph);
MS_EXCEPTION_IF_NULL(outputs);
MS_EXCEPTION_IF_NULL(tensor_to_node);
auto &input_nodes = kernel_graph->inputs();
if (input_nodes.size() != inputs.size()) {
MS_LOG(EXCEPTION) << "Input size not equal to input node size!";
@@ -222,7 +230,7 @@ void CPUKernelRuntime::CreateOutputTensors(session::KernelGraph *kernel_graph,
auto output_nodes = kernel_graph->outputs();
for (const auto &item : output_nodes) {
auto item_with_index = AnfAlgo::VisitKernelWithReturnType(item, 0, true);
auto out = CreatTensorForOutput(kernel_graph, item_with_index);
auto out = CreatTensorForOutput(kernel_graph, item_with_index, tensor_to_node);
outputs->push_back(std::move(out));
}
}
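CreateOutputTensors now threads tensor_to_node through every CreatTensorForOutput call, so each graph-output tensor is recorded against its producing (node, index) and flagged NeedWait, letting the session update the tensor's shape and data once dynamic kernels have actually run. The bookkeeping in miniature (the types below are simplified stand-ins):

    #include <cstddef>
    #include <map>
    #include <memory>
    #include <utility>

    struct NodeSketch {};                             // stand-in for AnfNode
    struct TensorSketch { bool need_wait = false; };  // stand-in for tensor::Tensor
    using TensorPtrSketch = std::shared_ptr<TensorSketch>;
    using KernelWithIndexSketch = std::pair<std::shared_ptr<NodeSketch>, size_t>;

    // Record an output tensor against its producing node/output index so it
    // can be finalized once the graph has executed.
    void RecordOutput(std::map<TensorPtrSketch, KernelWithIndexSketch> *m,
                      const TensorPtrSketch &t, const KernelWithIndexSketch &kwi) {
      t->need_wait = true;  // mirrors tensor->SetNeedWait(true) above
      (*m)[t] = kwi;
    }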
@@ -258,6 +266,12 @@ void CPUKernelRuntime::BindInputTensorAddressPtr(const session::KernelGraph &ker
MS_LOG(EXCEPTION) << "Parameter node sync host to device failed!";
}
}
if (item->cast<ParameterPtr>()->is_used_by_dynamic_kernel()) {
auto tensor_shape = tensor->shape();
std::vector<size_t> shape_tmp;
(void)std::transform(tensor_shape.begin(), tensor_shape.end(), std::back_inserter(shape_tmp), IntToSize);
AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(item, 0)}, {shape_tmp}, item.get());
}
address->ref_count_ = INIT_NODE_REF;
tensor->set_device_address(address);
}
@@ -325,6 +339,9 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink
#ifdef ENABLE_PROFILE
double start_time = GetTime();
#endif
if (AnfAlgo::IsDynamicShape(kernel)) {
AnfAlgo::InferShape(kernel);
}
std::vector<kernel::AddressPtr> kernel_inputs;
std::vector<kernel::AddressPtr> kernel_workspaces;
std::vector<kernel::AddressPtr> kernel_outputs;

@@ -39,7 +39,7 @@ class CPUKernelRuntime : public KernelRuntime {
bool Run(session::KernelGraph *graph, bool is_task_sink) override;
void AssignKernelAddress(session::KernelGraph *kernel_graph);
void CreateOutputTensors(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *outputs);
VectorRef *outputs, std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node);
void BindInputOutput(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *outputs);
void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs);
@@ -53,8 +53,10 @@ class CPUKernelRuntime : public KernelRuntime {
TypeId type_id) override;
private:
tensor::TensorPtr CreatTensorForOutput(session::KernelGraph *kernel_graph, const CNodePtr &node, size_t index);
BaseRef CreatTensorForOutput(session::KernelGraph *kernel_graph, const session::KernelWithIndex &kernel_with_index);
tensor::TensorPtr CreatTensorForOutput(session::KernelGraph *kernel_graph, const CNodePtr &node, size_t index,
std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node);
BaseRef CreatTensorForOutput(session::KernelGraph *kernel_graph, const session::KernelWithIndex &kernel_with_index,
std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node);
void BindInputTensorAddressPtr(const session::KernelGraph &graph, const std::vector<tensor::TensorPtr> &inputs);
void BindOutputTensorAddressPtr(const VectorRef *outputs);
void AssignValueNodeAddress(session::KernelGraph *kernel_graph);

@@ -51,17 +51,6 @@ void DynamicKernel::Initialize() {
int DynamicKernel::GetKernelType() { return AnfAlgo::GetKernelType(cnode_ptr_); }
bool IsTupleGetItem(const AnfNodePtr &anf_node) {
MS_EXCEPTION_IF_NULL(anf_node);
if (!anf_node->isa<CNode>()) {
return false;
}
auto cnode = anf_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
auto input0 = cnode->input(0);
return IsPrimitive(input0, prim::kPrimTupleGetItem);
}
void DynamicKernel::RebuildDependTensor() {
depend_tensor_map_.clear();
for (auto depend : depend_list_) {
@@ -112,7 +101,7 @@ void DynamicKernel::InferShape() {
auto cnode_input = cnode_ptr_->input(i + 1);
MS_EXCEPTION_IF_NULL(cnode_input);
if (IsTupleGetItem(cnode_input)) {
if (AnfAlgo::CheckPrimitiveType(cnode_input, prim::kPrimTupleGetItem)) {
auto base_shape = real_input->Shape();
if (!base_shape->isa<abstract::TupleShape>()) {
MS_LOG(EXCEPTION) << "Node:" << cnode_ptr_->fullname_with_scope()

@@ -259,6 +259,13 @@ class AbstractUndetermined : public AbstractBase {
}
set_shape(std::make_shared<Shape>(shape));
}
explicit AbstractUndetermined(const TypePtr &element_type, const BaseShapePtr &shape = std::make_shared<Shape>())
: AbstractBase(kAnyValue), element_(std::make_shared<AbstractScalar>(kAnyValue, element_type)) {
if (element_type == nullptr) {
MS_LOG(EXCEPTION) << "element_type is nullptr";
}
set_shape(shape);
}
~AbstractUndetermined() override = default;
MS_DECLARE_PARENT(AbstractUndetermined, AbstractBase)
TypePtr BuildType() const override { return std::make_shared<UndeterminedType>(); }
@@ -277,6 +284,8 @@ class AbstractTensor : public AbstractUndetermined {
: AbstractUndetermined(element, shape) {}
AbstractTensor(const TypePtr &element_type, const ShapeVector &shape) : AbstractUndetermined(element_type, shape) {}
explicit AbstractTensor(const tensor::TensorPtr &tensor) : AbstractUndetermined(tensor->Dtype(), tensor->shape()) {}
explicit AbstractTensor(const TypePtr &element_type, const BaseShapePtr &shape = std::make_shared<Shape>())
: AbstractUndetermined(element_type, shape) {}
~AbstractTensor() override = default;
MS_DECLARE_PARENT(AbstractTensor, AbstractUndetermined)

@@ -26,6 +26,12 @@
namespace mindspore {
namespace abstract {
const std::map<TypeId, size_t> type_map = {{kNumberTypeBool, 1}, {kNumberTypeInt, 4}, {kNumberTypeInt8, 1},
{kNumberTypeInt16, 2}, {kNumberTypeInt32, 4}, {kNumberTypeInt64, 8},
{kNumberTypeUInt, 4}, {kNumberTypeUInt8, 1}, {kNumberTypeUInt16, 2},
{kNumberTypeUInt32, 4}, {kNumberTypeUInt64, 8}, {kNumberTypeFloat, 4},
{kNumberTypeFloat16, 2}, {kNumberTypeFloat32, 4}, {kNumberTypeFloat64, 8}};
ValuePtr ValueJoin(const ValuePtr &value1, const ValuePtr &value2) {
MS_EXCEPTION_IF_NULL(value1);
MS_EXCEPTION_IF_NULL(value2);
@@ -291,5 +297,18 @@ ShapePtr GetBroadcastShape(const std::string &op, const AbstractTensorPtr &tenso
auto y_shape = tensor_y_shape->shape();
return std::make_shared<Shape>(RealBroadcast(op, x_shape, y_shape));
}
size_t TypeIdSize(const TypeId data_type) {
const size_t unsupported_type_error = 0;
auto iter = type_map.find(data_type);
if (iter != type_map.end()) {
return iter->second;
}
return unsupported_type_error;
}
size_t ShapeSize(const std::vector<size_t> &shape) {
return std::accumulate(shape.begin(), shape.end(), IntToSize(1), std::multiplies<size_t>());
}
} // namespace abstract
} // namespace mindspore
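Tensor::data_sync (changed later in this commit) sizes its device-to-host copy with exactly this pair of helpers, ShapeSize(shape) * TypeIdSize(dtype), rather than data().nbytes(); the apparent motivation is that after a shape-refreshing dynamic op the current inferred shape, not the originally allocated host buffer, is authoritative. The arithmetic as a one-function sketch:

    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Element count (product of dims) times per-element width. An unsupported
    // dtype yields 0, since TypeIdSize reports 0 for unknown types, and the
    // callers in this commit skip the copy when size == 0.
    size_t TensorNumBytes(const std::vector<size_t> &shape, size_t type_size) {
      size_t elems = std::accumulate(shape.begin(), shape.end(),
                                     static_cast<size_t>(1), std::multiplies<size_t>());
      return elems * type_size;
    }
    // e.g. TensorNumBytes({3}, 4) == 12 for the int32 Unique output of length 3.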

@@ -51,6 +51,9 @@ int64_t GetPositiveAxis(int64_t axis_value, size_t increment);
ShapeVector BroadcastShape(ShapeVector shpx, ShapeVector shpy);
size_t TypeIdSize(const TypeId data_type);
size_t ShapeSize(const std::vector<size_t> &shape);
// Get broadcasted shape for binary element-wise operation
ShapePtr GetBroadcastShape(const std::string &op, const AbstractTensorPtr &tensor_x, const AbstractTensorPtr &tensor_y);
} // namespace abstract

@@ -322,9 +322,17 @@ class Parameter : public ANode {
return shared_from_this() == other.shared_from_this();
}
void set_used_by_real_kernel() { is_real_kernel_used_ = false; }
bool is_used_by_real_kernel() { return is_real_kernel_used_; }
void set_used_by_dynamic_kernel() { is_used_by_dynamic_kernel_ = true; }
bool is_used_by_dynamic_kernel() { return is_used_by_dynamic_kernel_; }
private:
std::string name_;
bool has_default_;
bool is_real_kernel_used_ = true;
bool is_used_by_dynamic_kernel_ = false;
ValuePtr default_param_;
// The count of graphs using the parameter.
int used_graph_count_;
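Note the inversion in the flag API above: set_used_by_real_kernel() records that the parameter is NOT consumed by any real kernel (is_real_kernel_used_ defaults to true and is only ever cleared), which is why the call sites in session_basic.cc guard it with !IsUsedByRealKernel(...). A hypothetical wrapper, not part of the commit, that spells out the intended call pattern:

    // MarkParameter is illustrative only; it exists to make the inverted
    // setter semantics explicit.
    template <typename ParameterLike>
    void MarkParameter(ParameterLike *p, bool used_by_real_kernel, bool used_by_dynamic_kernel) {
      if (!used_by_real_kernel) p->set_used_by_real_kernel();  // clears is_real_kernel_used_
      if (used_by_dynamic_kernel) p->set_used_by_dynamic_kernel();
    }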

@@ -29,6 +29,7 @@
#include <type_traits>
#include <typeinfo>
#include "abstract/utils.h"
#include "abstract/abstract_value.h"
namespace mindspore {
@@ -581,8 +582,11 @@ void Tensor::data_sync(bool need_wait) const {
if (device_sync_ == nullptr) {
return;
}
std::vector<size_t> shape_tmp;
(void)std::transform(shape().begin(), shape().end(), std::back_inserter(shape_tmp), IntToSize);
auto size = abstract::ShapeSize(shape_tmp) * abstract::TypeIdSize(data_type());
auto address = device_sync_;
if (!address->SyncDeviceToHost(shape(), static_cast<size_t>(data().nbytes()), data_type(), data_c())) {
if (size != 0 && !address->SyncDeviceToHost(shape(), size, data_type(), data_c())) {
MS_LOG(EXCEPTION) << "SyncDeviceToHost failed.";
}
sync_status_ = kNeedSyncHostToDevice;

@@ -0,0 +1,70 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
import mindspore.common.dtype as mstype
from mindspore.ops import operations as P
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.unique = P.Unique().add_prim_attr("primitive_target", "CPU")
def construct(self, x):
x, y = self.unique(x)
return (x, y)
class UniqueSquare(nn.Cell):
def __init__(self):
super(UniqueSquare, self).__init__()
self.unique = P.Unique().add_prim_attr("primitive_target", "CPU")
self.square = P.Square()
def construct(self, x):
x, _ = self.unique(x)
return self.square(x)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
def test_unique_ascend():
x = Tensor(np.array([1, 1, 2, 2, 3, 3]), mstype.int32)
unique = Net()
output = unique(x)
expect1 = np.array([1, 2, 3])
expect2 = np.array([0, 0, 1, 1, 2, 2])
assert (output[0].asnumpy() == expect1).all()
assert (output[1].asnumpy() == expect2).all()
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
def test_unique_square():
x = Tensor(np.array([1, 1, 2, 2, 3, 3]), mstype.int32)
net = UniqueSquare()
output = net(x)
expect1 = np.array([1, 4, 9])
assert (output.asnumpy() == expect1).all()

@@ -0,0 +1,69 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
import mindspore.common.dtype as mstype
from mindspore.ops import operations as P
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.unique = P.Unique()
def construct(self, x):
return self.unique(x)
class UniqueSquare(nn.Cell):
def __init__(self):
super(UniqueSquare, self).__init__()
self.unique = P.Unique()
self.square = P.Square()
def construct(self, x):
x, _ = self.unique(x)
return self.square(x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_unique_cpu():
x = Tensor(np.array([1, 1, 2, 2, 3, 3]), mstype.int32)
unique = Net()
output = unique(x)
expect1 = np.array([1, 2, 3])
expect2 = np.array([0, 0, 1, 1, 2, 2])
assert (output[0].asnumpy() == expect1).all()
assert (output[1].asnumpy() == expect2).all()
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_unique_square():
x = Tensor(np.array([1, 1, 2, 2, 3, 3]), mstype.int32)
net = UniqueSquare()
output = net(x)
expect1 = np.array([1, 4, 9])
assert (output.asnumpy() == expect1).all()