From 24b8bc1cba8766e53a5fc2bba7c912872ed10601 Mon Sep 17 00:00:00 2001 From: yanghaoran Date: Thu, 29 Oct 2020 14:40:04 +0800 Subject: [PATCH] synchronize latest Ascend software suite 27 Oct 2020 --- inc/common/opskernel/ops_kernel_builder.h | 51 ++ inc/common/opskernel/ops_kernel_info_store.h | 17 +- inc/common/opskernel/ops_kernel_info_types.h | 8 +- inc/common/optimizer/graph_optimizer.h | 2 - .../aicore_manager/aicore_util_manager.h | 48 + .../ai_core/common/aicore_util_attr_define.h | 10 +- .../ai_core/common/aicore_util_constants.h | 54 ++ .../util/ai_core/common/aicore_util_types.h | 66 +- inc/common/util/ai_core/common/graph_comm.h | 54 +- inc/common/util/ai_core/common/json_util.h | 54 ++ .../util/ai_core/common/l2_stream_info.h | 44 + .../util/ai_core/common/scope_allocator.h | 8 +- .../param_calculate/tensorsize_calculator.h | 10 +- inc/common/util/error_manager/error_manager.h | 2 + inc/common/util/platform_info.h | 54 +- inc/common/util/platform_info_def.h | 172 ++-- inc/external/ge/ge_api_error_codes.h | 2 +- inc/external/ge/ge_ir_build.h | 21 + inc/external/graph/ascend_string.h | 38 + inc/external/graph/attr_value.h | 2 - inc/external/graph/ge_error_codes.h | 1 + inc/external/graph/gnode.h | 129 +++ inc/external/graph/graph.h | 29 +- inc/external/graph/operator.h | 23 +- inc/external/graph/tensor.h | 1 - inc/external/hccl/hccl.h | 134 +++ inc/external/hccl/hccl_types.h | 101 +++ inc/external/register/register.h | 2 - .../scope/scope_fusion_pass_register.h | 1 - inc/framework/common/ge_inner_error_codes.h | 2 +- inc/framework/common/op/attr_value_util.h | 1 - inc/framework/common/op/ge_op_utils.h | 3 +- inc/framework/common/string_util.h | 8 +- inc/framework/common/types.h | 1 + inc/framework/common/util.h | 2 +- inc/framework/engine/dnnengine.h | 1 + inc/framework/generator/ge_generator.h | 1 + inc/framework/memory/memory_api.h | 14 + inc/framework/memory/memory_assigner.h | 2 +- inc/framework/omg/omg.h | 3 - inc/framework/omg/omg_inner_types.h | 5 
+- inc/graph/buffer.h | 6 +- inc/graph/compute_graph.h | 2 - inc/graph/debug/ge_attr_define.h | 16 +- inc/graph/detail/any_map.h | 2 +- inc/graph/detail/attributes_holder.h | 4 +- inc/graph/ge_attr_value.h | 6 +- inc/graph/ge_context.h | 2 +- inc/graph/ge_local_context.h | 5 + inc/graph/node.h | 2 +- inc/graph/range_vistor.h | 4 - inc/graph/utils/graph_utils.h | 44 +- inc/graph/utils/node_adapter.h | 32 + inc/graph/utils/node_utils.h | 8 + .../common/graph/ascend_string.cc | 28 +- src/common/graph/format_refiner.cc | 33 +- src/common/graph/ge_attr_define.cc | 15 +- src/common/graph/ge_attr_value.cc | 29 +- src/common/graph/gnode.cc | 857 ++++++++++++++++++ src/common/graph/graph.cc | 235 ++++- src/common/graph/graph.mk | 15 +- src/common/graph/model.cc | 1 + src/common/graph/model_serialize.cc | 4 +- src/common/graph/op_desc.cc | 1 - src/common/graph/operator.cc | 22 +- src/common/graph/opsproto/opsproto_manager.cc | 2 - src/common/graph/option/ge_context.cc | 2 + src/common/graph/option/ge_local_context.cc | 14 + src/common/graph/shape_refiner.cc | 91 +- src/common/graph/stub/Makefile | 6 - src/common/graph/stub/gen_stubapi.py | 578 ------------ src/common/graph/tensor.cc | 24 +- src/common/graph/utils/graph_utils.cc | 333 ++++++- src/common/graph/utils/node_utils.cc | 37 + src/common/graph/utils/op_desc_utils.cc | 10 +- src/common/graph/utils/tuning_utils.cc | 21 +- src/ge/CMakeLists.txt | 9 +- src/ge/analyzer/analyzer.cc | 61 +- src/ge/analyzer/analyzer.h | 9 +- src/ge/client/ge_prof.cc | 23 +- src/ge/client/module.mk | 6 +- src/ge/common/auth/file_saver.cc | 22 +- src/ge/common/convert/pb2json.cc | 248 ----- src/ge/common/convert/pb2json.h | 68 -- src/ge/common/dump/dump_properties.cc | 2 +- src/ge/common/dump/dump_properties.h | 2 +- src/ge/common/ge/tbe_plugin_manager.cc | 36 - src/ge/common/ge/tbe_plugin_manager.h | 1 - src/ge/common/ge_common.mk | 5 +- src/ge/common/helper/model_cache_helper.cc | 1 - src/ge/common/op/attr_value_util.cc | 1 + 
src/ge/common/op/ge_op_utils.cc | 1 + src/ge/common/profiling/profiling_manager.cc | 35 +- src/ge/common/types.cc | 1 + src/ge/common/util.cc | 7 +- src/ge/engine_manager/dnnengine_manager.cc | 6 +- src/ge/engine_manager/engine_conf.json | 7 + src/ge/executor/ge_executor.cc | 42 +- src/ge/executor/module.mk | 15 +- src/ge/ge_inference.mk | 5 + .../ge_local_engine/engine/host_cpu_engine.cc | 52 +- src/ge/ge_local_engine/module.mk | 88 ++ .../ge_local_ops_kernel_builder.cc | 174 ++++ .../ge_local_ops_kernel_builder.h | 48 + .../ge_local_ops_kernel_info.cc | 133 --- .../ge_local_ops_kernel_info.h | 24 - .../ge_local_engine/ops_kernel_store/op/op.h | 2 +- src/ge/ge_runner.mk | 8 +- src/ge/generator/ge_generator.cc | 37 +- src/ge/generator/generator_api.cc | 2 +- src/ge/graph/build/graph_builder.cc | 156 +++- .../graph/build/memory/block_mem_assigner.cc | 230 ++++- .../graph/build/memory/block_mem_assigner.h | 67 +- .../graph/build/memory/graph_mem_assigner.cc | 330 +++++-- .../graph/build/memory/graph_mem_assigner.h | 19 +- .../graph/build/memory/hybrid_mem_assigner.cc | 3 +- .../graph/build/memory/hybrid_mem_assigner.h | 2 + src/ge/graph/build/memory/memory_assigner.cc | 2 +- src/ge/graph/build/memory/module.mk | 1 + .../graph/build/memory/var_mem_assign_util.h | 2 +- src/ge/graph/build/model_builder.cc | 20 +- src/ge/graph/build/model_builder.h | 3 +- src/ge/graph/build/run_context.cc | 41 +- src/ge/graph/build/run_context.h | 8 +- src/ge/graph/build/task_generator.cc | 14 +- src/ge/graph/common/ge_call_wrapper.h | 20 +- src/ge/graph/label/label_maker_factory.h | 7 +- .../load/new_model_manager/data_dumper.cc | 95 +- .../load/new_model_manager/data_dumper.h | 10 +- .../load/new_model_manager/davinci_model.cc | 194 +++- .../load/new_model_manager/davinci_model.h | 35 +- .../load/new_model_manager/model_utils.cc | 101 ++- .../task_info/kernel_ex_task_info.cc | 4 + .../task_info/kernel_task_info.cc | 7 + .../label_switch_by_index_task_info.cc | 4 +- 
.../task_info/memcpy_addr_async_task_info.cc | 15 +- .../task_info/memcpy_async_task_info.cc | 77 +- .../task_info/memcpy_async_task_info.h | 14 +- .../task_info/stream_switchn_task_info.cc | 33 +- .../new_model_manager/task_info/task_info.h | 18 + .../load/new_model_manager/ts_mem_mall.h | 104 +++ .../new_model_manager/zero_copy_offset.cc | 6 +- .../load/new_model_manager/zero_copy_offset.h | 2 +- src/ge/graph/manager/graph_manager.cc | 292 ++++-- src/ge/graph/manager/graph_manager.h | 51 +- src/ge/graph/manager/host_mem_manager.cc | 78 +- src/ge/graph/manager/host_mem_manager.h | 34 +- src/ge/graph/manager/memory_api.cc | 71 +- src/ge/graph/manager/rdma_pool_allocator.cc | 2 +- src/ge/graph/manager/util/hcom_util.h | 13 +- .../manager/util/variable_accelerate_ctrl.cc | 8 + .../manager/util/variable_accelerate_ctrl.h | 3 + src/ge/graph/optimize/graph_optimize.cc | 9 +- src/ge/graph/optimize/graph_optimize.h | 6 +- .../optimize/mem_rw_conflict_optimize.cc | 2 +- .../partition/dynamic_shape_partition.cc | 21 +- .../graph/partition/dynamic_shape_partition.h | 1 + src/ge/graph/partition/graph_partition.cc | 48 +- src/ge/graph/partition/graph_partition.h | 2 +- src/ge/graph/partition/stage_partition.cc | 376 ++++++++ src/ge/graph/partition/stage_partition.h | 67 ++ .../passes/aicpu_constant_folding_pass.cc | 15 +- src/ge/graph/passes/atomic_addr_clean_pass.cc | 4 +- src/ge/graph/passes/compile_nodes_pass.cc | 2 +- src/ge/graph/passes/constant_folding_pass.cc | 43 +- .../graph/passes/ctrl_edge_transfer_pass.cc | 7 + src/ge/graph/passes/infershape_pass.cc | 6 +- .../passes/mark_graph_unknown_status_pass.cc | 7 + src/ge/graph/passes/multi_batch_clone_pass.cc | 61 +- src/ge/graph/passes/multi_batch_clone_pass.h | 5 +- src/ge/graph/passes/multi_batch_pass.cc | 32 + src/ge/graph/passes/multi_batch_pass.h | 9 + .../passes/subexpression_migration_pass.cc | 12 +- .../passes/subgraph_const_migration_pass.cc | 570 ++++++++++++ .../passes/subgraph_const_migration_pass.h | 138 +++ 
src/ge/graph/passes/unused_args_clean_pass.cc | 4 + .../graph/preprocess/insert_op/ge_aipp_op.cc | 12 +- .../insert_op/util_insert_aipp_op.cc | 1 - .../preprocess/multi_batch_copy_graph.cc | 131 ++- .../graph/preprocess/multi_batch_copy_graph.h | 2 +- .../graph/preprocess/multi_batch_options.cc | 63 ++ src/ge/graph/preprocess/multi_batch_options.h | 21 + src/ge/host_cpu_engine/CMakeLists.txt | 209 +++++ src/ge/host_cpu_engine/module.mk | 81 ++ .../host_cpu_ops_kernel_builder.cc | 98 ++ .../host_cpu_ops_kernel_builder.h | 37 + .../host_cpu_ops_kernel_info.cc | 66 -- .../host_cpu_ops_kernel_info.h | 16 - src/ge/host_cpu_engine/proto/task.proto | 171 +++- src/ge/host_kernels/concat_v2_kernel.cc | 2 +- src/ge/host_kernels/fill_kernel.cc | 1 + src/ge/host_kernels/identity_kernel.cc | 1 + src/ge/host_kernels/pack_kernel.cc | 1 + src/ge/host_kernels/rank_kernel.cc | 1 + src/ge/host_kernels/rsqrt_kernel.cc | 1 + src/ge/host_kernels/shape_kernel.cc | 1 + src/ge/host_kernels/shape_n_kernel.cc | 1 + src/ge/host_kernels/strided_slice_kernel.cc | 101 ++- src/ge/host_kernels/strided_slice_kernel.h | 6 +- src/ge/hybrid/common/npu_memory_allocator.cc | 2 +- .../executor/hybrid_model_async_executor.cc | 6 +- .../hybrid/executor/hybrid_model_executor.cc | 2 +- src/ge/hybrid/executor/rt_callback_manager.cc | 9 +- src/ge/hybrid/executor/subgraph_executor.cc | 3 + .../executor/worker/execution_engine.cc | 133 ++- .../executor/worker/shape_inference_engine.cc | 7 +- src/ge/hybrid/model/hybrid_model.cc | 17 +- src/ge/hybrid/model/hybrid_model.h | 3 +- src/ge/hybrid/model/hybrid_model_builder.cc | 127 ++- src/ge/hybrid/model/node_item.cc | 9 +- .../aicore/aicore_node_executor.cc | 24 +- .../node_executor/aicore/aicore_op_task.cc | 2 + .../node_executor/aicore/aicore_op_task.h | 3 + .../aicore/aicore_task_compiler.cc | 22 +- .../aicore/aicore_task_compiler.h | 6 +- .../aicpu/aicpu_node_executor.cc | 14 +- .../node_executor/hccl/hccl_node_executor.cc | 22 +- 
.../node_executor/hccl/hccl_node_executor.h | 1 + .../host_cpu/host_cpu_node_executor.cc | 6 +- src/ge/hybrid/node_executor/node_executor.cc | 33 +- src/ge/hybrid/node_executor/node_executor.h | 3 +- src/ge/hybrid/node_executor/task_context.cc | 2 +- src/ge/inc/graph_pass.h | 1 + src/ge/init/gelib.cc | 47 + src/ge/init/gelib.h | 1 + src/ge/ir_build/atc_ir_common.cc | 45 +- src/ge/ir_build/atc_ir_common.h | 1 + src/ge/ir_build/ge_ir_build.cc | 96 +- src/ge/model/ge_model.cc | 1 + .../ops_kernel_builder_manager.cc | 153 ++++ .../ops_kernel_builder_manager.h | 56 ++ .../opskernel_manager/ops_kernel_manager.cc | 4 +- .../optimizer_priority.pbtxt | 2 +- src/ge/plugin/engine/dnnengines.cc | 18 +- src/ge/plugin/engine/dnnengines.h | 15 + src/ge/plugin/engine/engine_manage.cc | 20 +- src/ge/session/inner_session.cc | 24 +- src/ge/session/inner_session.h | 2 + src/ge/session/omg.cc | 227 ++--- src/ge/single_op/task/aicpu_task_builder.cc | 10 +- src/ge/stub/Makefile | 6 - src/ge/stub/README | 4 - src/ge/stub/README.md | 44 - src/ge/stub/gen_stubapi.py | 578 ------------ third_party/fwkacllib/inc/hccl/base.h | 10 + third_party/fwkacllib/inc/hccl/hccl_types.h | 99 -- third_party/fwkacllib/inc/hccl/hcom.h | 9 + third_party/fwkacllib/inc/mmpa/mmpa_api.h | 18 +- .../fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h | 69 ++ .../inc/mmpa/sub_inc/mmpa_typedef_linux.h | 3 + .../inc/mmpa/sub_inc/mmpa_typedef_win.h | 4 + .../fwkacllib/inc/mmpa/sub_inc/mmpa_win.h | 82 +- third_party/fwkacllib/inc/ops/aipp.h | 21 +- third_party/fwkacllib/inc/ops/all_ops.h | 6 +- third_party/fwkacllib/inc/ops/array_ops.h | 39 +- third_party/fwkacllib/inc/ops/audio_ops.h | 6 +- third_party/fwkacllib/inc/ops/batch_ops.h | 6 +- third_party/fwkacllib/inc/ops/bitwise_ops.h | 6 +- .../fwkacllib/inc/ops/boosted_trees_ops.h | 6 +- .../inc/ops/candidate_sampling_ops.h | 6 +- third_party/fwkacllib/inc/ops/condtake_ops.h | 6 +- .../fwkacllib/inc/ops/control_flow_ops.h | 6 +- third_party/fwkacllib/inc/ops/ctc_ops.h | 6 +- 
third_party/fwkacllib/inc/ops/data_flow_ops.h | 6 +- .../inc/ops/elewise_calculation_ops.h | 161 ++-- .../fwkacllib/inc/ops/functional_ops.h | 26 +- third_party/fwkacllib/inc/ops/get_data_ops.h | 6 +- third_party/fwkacllib/inc/ops/hcom_ops.h | 43 +- third_party/fwkacllib/inc/ops/hvd_ops.h | 6 +- third_party/fwkacllib/inc/ops/image_ops.h | 13 +- third_party/fwkacllib/inc/ops/internal_ops.h | 6 +- third_party/fwkacllib/inc/ops/linalg_ops.h | 6 +- third_party/fwkacllib/inc/ops/logging_ops.h | 6 +- third_party/fwkacllib/inc/ops/lookup_ops.h | 6 +- third_party/fwkacllib/inc/ops/math_ops.h | 154 +++- .../inc/ops/matrix_calculation_ops.h | 40 +- .../fwkacllib/inc/ops/nn_batch_norm_ops.h | 166 ++-- .../fwkacllib/inc/ops/nn_calculation_ops.h | 158 +++- third_party/fwkacllib/inc/ops/nn_detect_ops.h | 125 +-- third_party/fwkacllib/inc/ops/nn_norm_ops.h | 56 +- third_party/fwkacllib/inc/ops/nn_ops.h | 6 +- .../fwkacllib/inc/ops/nn_pooling_ops.h | 259 +++++- .../fwkacllib/inc/ops/nn_training_ops.h | 45 +- third_party/fwkacllib/inc/ops/no_op.h | 6 +- .../fwkacllib/inc/ops/nonlinear_fuc_ops.h | 6 +- .../fwkacllib/inc/ops/npu_loss_scale_ops.h | 7 +- third_party/fwkacllib/inc/ops/outfeed_ops.h | 6 +- third_party/fwkacllib/inc/ops/pad_ops.h | 77 +- third_party/fwkacllib/inc/ops/parsing_ops.h | 6 +- third_party/fwkacllib/inc/ops/quantize_ops.h | 6 +- .../fwkacllib/inc/ops/ragged_array_ops.h | 6 +- .../fwkacllib/inc/ops/ragged_conversion_ops.h | 10 +- .../fwkacllib/inc/ops/ragged_math_ops.h | 6 +- third_party/fwkacllib/inc/ops/random_ops.h | 28 +- third_party/fwkacllib/inc/ops/reduce_ops.h | 30 +- .../fwkacllib/inc/ops/resource_variable_ops.h | 6 +- third_party/fwkacllib/inc/ops/rnn.h | 123 ++- third_party/fwkacllib/inc/ops/rpn_ops.h | 6 +- third_party/fwkacllib/inc/ops/save_ops.h | 8 +- third_party/fwkacllib/inc/ops/sdca_ops.h | 19 +- third_party/fwkacllib/inc/ops/selection_ops.h | 69 +- third_party/fwkacllib/inc/ops/set_ops.h | 6 +- third_party/fwkacllib/inc/ops/sparse_ops.h | 6 +- 
third_party/fwkacllib/inc/ops/spectral_ops.h | 6 +- .../fwkacllib/inc/ops/split_combination_ops.h | 21 +- third_party/fwkacllib/inc/ops/state_ops.h | 6 +- .../fwkacllib/inc/ops/stateful_random_ops.h | 6 +- .../fwkacllib/inc/ops/stateless_random_ops.h | 6 +- third_party/fwkacllib/inc/ops/string_ops.h | 6 +- third_party/fwkacllib/inc/ops/swap_co_ops.h | 6 +- .../fwkacllib/inc/ops/transformation_ops.h | 32 +- .../fwkacllib/inc/ops/warp_perspective_ops.h | 6 +- .../inc/register/op_kernel_registry.h | 1 - .../fwkacllib/inc/register/op_tiling.h | 1 + .../register/ops_kernel_builder_registry.h | 67 ++ third_party/fwkacllib/inc/runtime/base.h | 23 + third_party/fwkacllib/inc/runtime/context.h | 7 + third_party/fwkacllib/inc/runtime/dev.h | 17 + third_party/fwkacllib/inc/runtime/kernel.h | 12 + third_party/fwkacllib/inc/runtime/mem.h | 46 +- third_party/fwkacllib/inc/runtime/rt_model.h | 19 + .../fwkacllib/inc/tdt/tdt_host_interface.h | 87 ++ .../fwkacllib/inc/toolchain/prof_acl_api.h | 9 + third_party/fwkacllib/inc/toolchain/slog.h | 26 +- 324 files changed, 9808 insertions(+), 4072 deletions(-) create mode 100644 inc/common/opskernel/ops_kernel_builder.h create mode 100644 inc/common/util/ai_core/aicore_manager/aicore_util_manager.h create mode 100644 inc/common/util/ai_core/common/aicore_util_constants.h create mode 100644 inc/common/util/ai_core/common/json_util.h create mode 100644 inc/common/util/ai_core/common/l2_stream_info.h create mode 100644 inc/external/graph/ascend_string.h create mode 100644 inc/external/graph/gnode.h create mode 100644 inc/external/hccl/hccl.h create mode 100644 inc/external/hccl/hccl_types.h create mode 100644 inc/graph/utils/node_adapter.h rename inc/common/util/ai_core/param_calculate/aicore_param_calculator.h => src/common/graph/ascend_string.cc (62%) create mode 100644 src/common/graph/gnode.cc delete mode 100644 src/common/graph/stub/Makefile delete mode 100644 src/common/graph/stub/gen_stubapi.py delete mode 100644 
src/ge/common/convert/pb2json.cc delete mode 100644 src/ge/common/convert/pb2json.h create mode 100644 src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_builder.cc create mode 100644 src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_builder.h create mode 100644 src/ge/graph/load/new_model_manager/ts_mem_mall.h create mode 100644 src/ge/graph/partition/stage_partition.cc create mode 100644 src/ge/graph/partition/stage_partition.h create mode 100644 src/ge/graph/passes/subgraph_const_migration_pass.cc create mode 100644 src/ge/graph/passes/subgraph_const_migration_pass.h create mode 100644 src/ge/host_cpu_engine/CMakeLists.txt create mode 100644 src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.cc create mode 100644 src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.h mode change 120000 => 100644 src/ge/host_cpu_engine/proto/task.proto create mode 100644 src/ge/opskernel_manager/ops_kernel_builder_manager.cc create mode 100644 src/ge/opskernel_manager/ops_kernel_builder_manager.h delete mode 100644 src/ge/stub/Makefile delete mode 100644 src/ge/stub/README delete mode 100755 src/ge/stub/README.md delete mode 100644 src/ge/stub/gen_stubapi.py delete mode 100644 third_party/fwkacllib/inc/hccl/hccl_types.h create mode 100644 third_party/fwkacllib/inc/register/ops_kernel_builder_registry.h diff --git a/inc/common/opskernel/ops_kernel_builder.h b/inc/common/opskernel/ops_kernel_builder.h new file mode 100644 index 00000000..c55dea9f --- /dev/null +++ b/inc/common/opskernel/ops_kernel_builder.h @@ -0,0 +1,51 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INC_COMMON_OPSKERNELUTILS_OPS_KERNEL_INFO_UTILS_H_ +#define INC_COMMON_OPSKERNELUTILS_OPS_KERNEL_INFO_UTILS_H_ + +#include "external/ge/ge_api_error_codes.h" +#include "cce/aicpu_engine_struct.h" +#include "common/opskernel/ops_kernel_info_types.h" +#include "graph/node.h" +#include "proto/task.pb.h" + +namespace ge { +class OpsKernelBuilder { + public: + OpsKernelBuilder() = default; + virtual ~OpsKernelBuilder() = default; + + // initialize OpsKernelBuilder + virtual Status Initialize(const std::map &options) = 0; + + // finalize OpsKernelBuilder + virtual Status Finalize() = 0; + + // memory allocation requirement + virtual Status CalcOpRunningParam(Node &node) = 0; + + // generate task for op + virtual Status GenerateTask(const Node &node, RunContext &context, std::vector &tasks) = 0; + + // only call aicpu interface to generate task struct + virtual Status GenSingleOpRunTask(const NodePtr &node, STR_FWK_OP_KERNEL &task, string &task_info) { return FAILED; } + + // only call aicpu interface to generate task struct + virtual Status GenMemCopyTask(uint64_t count, STR_FWK_OP_KERNEL &task, string &task_info) { return FAILED; } +}; +} // namespace ge +#endif // INC_COMMON_OPSKERNELUTILS_OPS_KERNEL_INFO_UTILS_H_ diff --git a/inc/common/opskernel/ops_kernel_info_store.h b/inc/common/opskernel/ops_kernel_info_store.h index ce1464d4..e8f7c2a1 100644 --- a/inc/common/opskernel/ops_kernel_info_store.h +++ b/inc/common/opskernel/ops_kernel_info_store.h @@ -43,10 +43,10 @@ class OpsKernelInfoStore { virtual ~OpsKernelInfoStore() {} // 
initialize opsKernelInfoStore - virtual Status Initialize(const map &options) = 0; /*lint -e148*/ + virtual Status Initialize(const map &options) = 0; // close opsKernelInfoStore - virtual Status Finalize() = 0; /*lint -e148*/ + virtual Status Finalize() = 0; virtual Status CreateSession(const std::map &session_options) { return SUCCESS; } @@ -65,24 +65,11 @@ class OpsKernelInfoStore { // opsFlag opsFlag[0] indicates constant folding is supported or not virtual void opsFlagCheck(const ge::Node &node, std::string &opsFlag){}; - // memory allocation requirement - virtual Status CalcOpRunningParam(Node &node) = 0; /*lint -e148*/ - - // generate task for op。 - virtual Status GenerateTask(const Node &node, RunContext &context, - std::vector &tasks) = 0; /*lint -e148*/ - // only call fe engine interface to compile single op virtual Status CompileOp(vector &node_vec) { return SUCCESS; } virtual Status CompileOpRun(vector &node_vec) { return SUCCESS; } // load task for op virtual Status LoadTask(GETaskInfo &task) { return SUCCESS; } - - // only call aicpu interface to generate task struct - virtual Status GenSingleOpRunTask(const NodePtr &node, STR_FWK_OP_KERNEL &task, string &task_info) { return SUCCESS; } - - // only call aicpu interface to generate task struct - virtual Status GenMemCopyTask(uint64_t count, STR_FWK_OP_KERNEL &task, string &task_info) { return SUCCESS; } }; } // namespace ge #endif // INC_COMMON_OPSKERNEL_OPS_KERNEL_INFO_STORE_H_ diff --git a/inc/common/opskernel/ops_kernel_info_types.h b/inc/common/opskernel/ops_kernel_info_types.h index 684c1abc..097d2e47 100644 --- a/inc/common/opskernel/ops_kernel_info_types.h +++ b/inc/common/opskernel/ops_kernel_info_types.h @@ -26,13 +26,14 @@ using std::string; namespace ge { -/*lint -e148*/ struct RunContext { rtModel_t model; rtStream_t stream; uint64_t sessionId; uint64_t dataMemSize; uint8_t *dataMemBase; + std::map mem_type_data_mem_size; + std::map mem_type_data_mem_base; uint64_t weightMemSize; uint8_t 
*weightMemBase; ge::Buffer weightsBuffer; @@ -41,8 +42,6 @@ struct RunContext { std::vector graphLabelList; // all labels of graph, order by ge label id(0,1,...) }; -/*lint +e148*/ - struct Task { uint32_t id; uint16_t type; @@ -51,8 +50,7 @@ struct Task { }; struct OpInfo { - string engine; // which engin - /*lint -e148*/ + string engine; // which engin string opKernelLib; // which opsKernelStore int computeCost; // compute cost bool flagPartial; // whether to support is related to shape diff --git a/inc/common/optimizer/graph_optimizer.h b/inc/common/optimizer/graph_optimizer.h index 253aaae1..32ea944c 100644 --- a/inc/common/optimizer/graph_optimizer.h +++ b/inc/common/optimizer/graph_optimizer.h @@ -27,7 +27,6 @@ using std::map; using std::string; -/*lint -e148*/ namespace ge { class GraphOptimizer { public: @@ -67,5 +66,4 @@ class GraphOptimizer { virtual Status OptimizeFusedGraphAfterGraphSlice(ComputeGraph &graph) { return SUCCESS; } }; } // namespace ge -/*lint +e148*/ #endif // INC_COMMON_OPTIMIZER_GRAPH_OPTIMIZER_H_ diff --git a/inc/common/util/ai_core/aicore_manager/aicore_util_manager.h b/inc/common/util/ai_core/aicore_manager/aicore_util_manager.h new file mode 100644 index 00000000..cb2f95bb --- /dev/null +++ b/inc/common/util/ai_core/aicore_manager/aicore_util_manager.h @@ -0,0 +1,48 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef AICORE_UTIL_MANAGER_H_ +#define AICORE_UTIL_MANAGER_H_ + +#include +#include "register/graph_optimizer/graph_optimize_register_error_codes.h" + +namespace fe { +class AICoreUtilManager { + public: + static AICoreUtilManager &Instance(); + /* + * to initialize the aicore configuration + * param[in] the options of init + * param[in] engine Name + * param[in] socVersion soc version from ge + * return Status(SUCCESS/FAILED) + */ + Status Initialize(const std::map &options, std::string &soc_version); + + /* + * to release the source of fusion manager + * return Status(SUCCESS/FAILED) + */ + Status Finalize(); + + private: + AICoreUtilManager(); + ~AICoreUtilManager(); + bool is_init_; +}; +} // namespace fe +#endif // AICORE_UTIL_MANAGER_H \ No newline at end of file diff --git a/inc/common/util/ai_core/common/aicore_util_attr_define.h b/inc/common/util/ai_core/common/aicore_util_attr_define.h index ba28d7b3..6321dfa8 100644 --- a/inc/common/util/ai_core/common/aicore_util_attr_define.h +++ b/inc/common/util/ai_core/common/aicore_util_attr_define.h @@ -36,6 +36,14 @@ static const std::string L1_OPTIMIZED = "l1_optimized"; static const std::string L2_OPTIMIZED = "l2_optimized"; -static const std::string OP_SLICE_INFO = "_op_slice_info"; +static const std::string ATTR_NAME_UNKNOWN_SHAPE = "_unknown_shape"; + +static const std::string ATTR_NAME_IS_UNKNOWN_GRAPH = "_fe_is_unknown_graph"; + +static const std::string ATTR_NAME_IS_UNKNOWN_SHAPE_OP = "_fe_is_unknown_shape_op"; + +static const std::string ATTR_NAME_TVM_CACHE_READ_MODE = "tvm_cache_read_mode"; + +static const std::string ATTR_NAME_TBE_KERNEL_SIZE = "_tbeKernelSize"; } // namespace fe #endif diff --git a/inc/common/util/ai_core/common/aicore_util_constants.h b/inc/common/util/ai_core/common/aicore_util_constants.h new file mode 100644 index 00000000..2856fa5b --- /dev/null +++ b/inc/common/util/ai_core/common/aicore_util_constants.h @@ -0,0 +1,54 @@ +/** + * Copyright 2019-2020 Huawei Technologies 
Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INC_COMMON_UTILS_AI_CORE_COMMON_CONSTANTS_H_ +#define INC_COMMON_UTILS_AI_CORE_COMMON_CONSTANTS_H_ + +#include + +namespace fe { +static const std::string CORE_TYPE = "_coretype"; +/* engine name of AI core and vector core */ +static const std::string AI_CORE_NAME = "AIcoreEngine"; +static const std::string VECTOR_CORE_NAME = "VectorEngine"; + +static const int64_t IS_UNKNOWN_SHAPE_VALUE = 1; + +static const int64_t SHAPE_UNKNOWN_DIM = -1; + +static const int64_t SHAPE_UNKNOWN_DIM_NUM = -2; + +static const std::string SOC_VERSION_ASCEND310 = "Ascend310"; +static const std::string SOC_VERSION_ASCEND610 = "Ascend610"; +static const std::string SOC_VERSION_ASCEND615 = "Ascend615"; +static const std::string SOC_VERSION_ASCEND710 = "Ascend710"; +static const std::string SOC_VERSION_ASCEND710P = "Ascend710Pro"; +static const std::string SOC_VERSION_ASCEND910A = "Ascend910A"; +static const std::string SOC_VERSION_ASCEND910B = "Ascend910B"; +static const std::string SOC_VERSION_ASCEND910PROA = "Ascend910ProA"; +static const std::string SOC_VERSION_ASCEND910PROB = "Ascend910ProB"; +static const std::string SOC_VERSION_ASCEND910PREMIUMA = "Ascend910PremiumA"; +static const std::string SOC_VERSION_HI3796CV300ES = "Hi3796CV300ES"; +static const std::string SOC_VERSION_HI3796CV300CS = "Hi3796CV300CS"; + +static const std::vector SOC_VERSION_CLOUD_LIST = {SOC_VERSION_ASCEND910A, 
SOC_VERSION_ASCEND910B, + SOC_VERSION_ASCEND910PROA, SOC_VERSION_ASCEND910PROB, + SOC_VERSION_ASCEND910PREMIUMA}; + +static const std::vector SOC_VERSION_DC_LIST = {SOC_VERSION_ASCEND610, SOC_VERSION_ASCEND615, + SOC_VERSION_ASCEND710, SOC_VERSION_ASCEND710P}; +} // namespace fe +#endif diff --git a/inc/common/util/ai_core/common/aicore_util_types.h b/inc/common/util/ai_core/common/aicore_util_types.h index b2615dc9..18f57715 100644 --- a/inc/common/util/ai_core/common/aicore_util_types.h +++ b/inc/common/util/ai_core/common/aicore_util_types.h @@ -42,47 +42,61 @@ struct FusionDataFlow { std::pair node_dataindex_pair; }; -typedef struct tagL2FusionData { +typedef struct tag_l2_fusion_data { uint32_t l2Index; uint64_t l2Addr; uint64_t l2PageNum; } L2FusionData_t; typedef std::map L2FusionDataMap_t; -typedef struct tagFeSmDesc { +typedef struct tag_fe_sm_desc { rtL2Ctrl_t l2ctrl; - std::string nodeName[8]; - uint8_t outputIndex[8]; -} feSmDesc_t; + std::string node_name[8]; + uint8_t output_index[8]; +} fe_sm_desc_t; typedef struct TagTaskL2FusionInfo { - std::string nodeName; - feSmDesc_t l2Info; + std::string node_name; + fe_sm_desc_t l2_info; L2FusionDataMap_t input; L2FusionDataMap_t output; - uint32_t isUsed; + uint32_t is_used; } TaskL2FusionInfo_t; using L2FusionInfoPtr = std::shared_ptr; typedef struct ToOpStruct { - int64_t opL1Space = 0; - std::vector opL1FusionType; - int64_t opL1WorkspaceFlag = 0; // for workspace flag - int64_t opL1WorkspaceSize = 0; - std::vector> validInputShape; - std::vector> validOutputShape; - std::vector> sliceInputOffset; // conv & pooling & ReadSelect - std::vector> sliceOutputOffset; // WriteSelect - std::vector totalShape; - uint32_t splitIndex = 0; + int64_t op_l1_space = 0; + std::vector op_l1_fusion_type; + int64_t op_l1_workspace_flag = 0; // for workspace flag + int64_t op_l1_workspace_size = 0; + std::vector> valid_input_shape; + std::vector> valid_output_shape; + std::vector> slice_input_offset; // conv & pooling & 
ReadSelect + std::vector> slice_output_offset; // WriteSelect + std::vector total_shape; + uint32_t split_index = 0; ToOpStruct() { // set invalid value for essential variable - opL1Space = -1; - opL1WorkspaceSize = -1; + op_l1_space = -1; + op_l1_workspace_size = -1; } } ToOpStruct_t; +enum SlicePattern { + ELEMENT_WISE = 0, + ELEMENT_WISE_BROADCAST, + BROADCAST, + SLIDING_WINDOW, + SLIDING_WINDOW_DECONV, + CUBE_MATMUL, + SLICE_PATTERN_REDUCE, + SLICE_PATTERN_RESIZE, + SLICE_PATTERN_SCATTER, + SLICE_PATTERN_SEGMENT, + PATTERN_RESERVED +}; + enum OpImplType { EN_IMPL_CUSTOM_CONSTANT_CCE = 0, // custom constant op EN_IMPL_CUSTOM_TIK, // custom tik op @@ -99,6 +113,10 @@ enum OpImplType { EN_RESERVED // reserved value }; +// Dont change the order, only add new mode in the end +enum L2Mode { EN_L2_CLOSE = 0, EN_L2_BUFFER_OPTIMIZE, EN_L2_CACHE_NORMAL, EN_L2_CACHE_RC }; +enum BufferFusionMode { EN_OPTIMIZE_DISABLE = 0, EN_L2_BUFFER, EN_L2_FUSION }; + static const std::map DATATYPE_SIZE_MAP{{ge::DT_FLOAT, sizeof(float)}, {ge::DT_FLOAT16, sizeof(int16_t)}, {ge::DT_INT8, sizeof(int8_t)}, @@ -114,5 +132,13 @@ static const std::map DATATYPE_SIZE_MAP{{ge::DT_FLOAT, s {ge::DT_DUAL, sizeof(float) + sizeof(int8_t)}, {ge::DT_DUAL_SUB_UINT8, sizeof(int8_t)}, {ge::DT_DUAL_SUB_INT8, sizeof(int8_t)}}; + +enum OpReduceType { + REDUCE_MEAN = 0, + REDUCE_ADD, + REDUCE_MAX, + REDUCE_MIN, +}; + } // namespace fe #endif diff --git a/inc/common/util/ai_core/common/graph_comm.h b/inc/common/util/ai_core/common/graph_comm.h index d672e056..2f061c54 100644 --- a/inc/common/util/ai_core/common/graph_comm.h +++ b/inc/common/util/ai_core/common/graph_comm.h @@ -28,33 +28,34 @@ namespace fe { -using kScopeNodeMap_t = std::map>; -using kScopeNodePair_t = std::pair>; +using k_scope_node_map_t = std::map>; +using k_scope_node_pair_t = std::pair>; class GraphCommImpl; using GraphCommImplPtr = std::unique_ptr; class GraphComm { public: - GraphComm(const string &engineName); + GraphComm(const string 
&engine_name); virtual ~GraphComm(); GraphComm(const GraphComm &in) = delete; GraphComm &operator=(const GraphComm &in) = delete; - Status GetscopeNodeMap(ge::ComputeGraph &graph, kScopeNodeMap_t &fusionMap); + Status GetscopeNodeMap(ge::ComputeGraph &graph, k_scope_node_map_t &fusion_map); - Status CopyFusionOpNodes(vector &fusInputEdgeList, vector &fusOutputEdgeList, - vector &fusNodelist, ge::OpDescPtr fusionOpDesc, - ge::ComputeGraphPtr fusionGraph); + Status CopyFusionOpNodes(vector &fus_input_edge_list, vector &fus_output_edge_list, + vector &fus_nodelist, ge::OpDescPtr fusion_op_desc, + ge::ComputeGraphPtr fusion_graph); - Status CopyFusionOpEdges(ge::OpDescPtr fusionOpDesc, ge::ComputeGraph &origGraph, ge::ComputeGraphPtr fusionGraph); + Status CopyFusionOpEdges(ge::OpDescPtr fusion_op_desc, ge::ComputeGraph &orig_graph, + ge::ComputeGraphPtr fusion_graph); - Status GetNodeDataFlowMap(const ge::NodePtr &fusNode, - std::map> &fusionOpAnchorsMap, - ge::kFusionDataFlowVec_t &fusDataflowList, const int &mapType); + Status GetNodeDataFlowMap(const ge::NodePtr &fus_node, + std::map> &fusion_op_anchors_map, + ge::kFusionDataFlowVec_t &fus_dataflow_list, const int &map_type); - Status GetFusionNodeEdgeList(std::vector &fusNodelist, std::vector &fusInputEdgeList, - std::vector &fusOutputEdgeList); + Status GetFusionNodeEdgeList(std::vector &fus_nodelist, std::vector &fus_input_edge_list, + std::vector &fus_output_edge_list); void ClearFusionSrc(); void ClearFusionDst(); @@ -72,25 +73,26 @@ class GraphComm { bool GetFusionSrc(const uint32_t &src_op_id, const ge::AnchorPtr &src_anchor, int32_t &fusion_src_index, int32_t &fusion_dst_index); - Status GetFusionNodeCtrlEdgeList(vector &fusNodelist, vector &fusInputCtrlEdgeList, - vector &fusOutputCtrlEdgeList); + Status GetFusionNodeCtrlEdgeList(vector &fus_nodelist, vector &fus_input_ctrl_edge_list, + vector &fus_output_ctrl_edge_list); - Status MergeFusionNodeEdgeList(ge::NodePtr &fusNode, vector &fusNodelist, - vector 
&fusInputEdgeList, vector &fusOutputEdgeList); + Status MergeFusionNodeEdgeList(ge::NodePtr &fus_node, vector &fus_nodelist, + vector &fus_input_edge_list, + vector &fus_output_edge_list); - Status MergeFusionNodeCtrlEdgeList(ge::NodePtr &fusNode, vector &fusNodelist, - vector &fusInputEdgeList, - vector &fusOutputEdgeList); + Status MergeFusionNodeCtrlEdgeList(ge::NodePtr &fus_node, vector &fus_nodelist, + vector &fus_input_edge_list, + vector &fus_output_edge_list); string GetEngineName(); private: - Status MergeFusionNodeInputEdgeList(ge::NodePtr fusNode, std::vector &fusNodelist, - std::vector &fusInputEdgeList); - Status MergeFusionNodeOutputEdgeList(ge::NodePtr fusNode, std::vector &fusNodelist, - std::vector &fusOutputEdgeList); + Status MergeFusionNodeInputEdgeList(ge::NodePtr fus_node, std::vector &fus_nodelist, + std::vector &fus_input_edge_list); + Status MergeFusionNodeOutputEdgeList(ge::NodePtr fus_node, std::vector &fus_nodelist, + std::vector &fus_output_edge_list); - string engineName_; + string engine_name_; std::vector exist_fusion_src_list_; std::vector exist_fusion_dst_list_; @@ -101,7 +103,7 @@ class GraphComm { // std::vector> ge::kFusionDataFlowVec_t fusion_output_dataflow_list_; - GraphCommImplPtr graphCommImplPtr_; + GraphCommImplPtr graph_comm_impl_ptr_; }; } // namespace fe #endif diff --git a/inc/common/util/ai_core/common/json_util.h b/inc/common/util/ai_core/common/json_util.h new file mode 100644 index 00000000..62c364b2 --- /dev/null +++ b/inc/common/util/ai_core/common/json_util.h @@ -0,0 +1,54 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PROJECT_JSON_UTIL_H +#define PROJECT_JSON_UTIL_H + +#include "graph/compute_graph.h" + +#include "common/aicore_util_types.h" +#include "fusion_engine/graph_tuner/graph_tuner_errorcode.h" + +const std::string L1_FUSION_EXTEND_CONTENT = "_l1_fusion_extend_content"; +const std::string L2_FUSION_EXTEND_CONTENT = "l2_fusion_extend_content"; +const std::string TASK_L2_FUSION_INFO_EXTEND_CONTENT = "task_l2_fusion_info_extend_content"; +const std::string L1_FUSION_TO_OP_STRUCT = "_l1fusion_ToOpStruct"; +const std::string L2_FUSION_TO_OP_STRUCT = "_l2fusion_ToOpStruct"; +const std::string TASK_L2_FUSION_INFO = "_task_L2FusionInfo"; + +namespace tune { +using ToOpStructPtr = std::shared_ptr; +using L2FusionInfoPtr = std::shared_ptr; + +Status GetL1InfoFromJson(ge::OpDescPtr opDescPtr); + +Status GetL2InfoFromJson(ge::OpDescPtr opDescPtr); + +Status GetTaskL2FusionInfoFromJson(ge::OpDescPtr opDescPtr); + +Status ReadGraphInfoFromJson(ge::ComputeGraph &graph); + +Status WriteGraphInfoToJson(ge::ComputeGraph &graph); + +void GetL2ToOpStructFromJson(ge::OpDescPtr &opDescPtr, ToOpStructPtr &l2InfoPtr); + +void GetL1ToOpStructFromJson(ge::OpDescPtr &opDescPtr, ToOpStructPtr &l1InfoPtr); + +L2FusionInfoPtr GetL2FusionInfoFromJson(ge::OpDescPtr &opDescPtr); + +void SetL2FusionInfoToNode(ge::OpDescPtr &opDescPtr, L2FusionInfoPtr &l2FusionInfoPtr); +} // namespace tune +#endif // PROJECT_JSON_UTIL_H diff --git a/inc/common/util/ai_core/common/l2_stream_info.h b/inc/common/util/ai_core/common/l2_stream_info.h new file mode 100644 index 
00000000..1f3dbd05 --- /dev/null +++ b/inc/common/util/ai_core/common/l2_stream_info.h @@ -0,0 +1,44 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef L2_STREAM_INFO_H_ +#define L2_STREAM_INFO_H_ + +#include +#include +#include +#include "register/graph_optimizer/graph_optimize_register_error_codes.h" +#include "runtime/base.h" +#include "cce/l2fusion_struct.hpp" + +namespace fe { +class StreamL2Info { + public: + StreamL2Info(const StreamL2Info &) = delete; + StreamL2Info &operator=(const StreamL2Info &) = delete; + static StreamL2Info &Instance(); + Status GetStreamL2Info(rtStream_t stream_id, string node_name, fusion::TaskL2Info_t *&l2_data); + Status SetStreamL2Info(const rtStream_t &stream_id, fusion::TaskL2InfoFEMap_t &l2_alloc_res); + + private: + StreamL2Info(); + ~StreamL2Info(); + mutable std::mutex stream_l2_mutex_; + std::map stream_l2_map_; +}; +} // namespace fe + +#endif // L2_STREAM_INFO_H_ \ No newline at end of file diff --git a/inc/common/util/ai_core/common/scope_allocator.h b/inc/common/util/ai_core/common/scope_allocator.h index 6cebb286..310b3f7c 100644 --- a/inc/common/util/ai_core/common/scope_allocator.h +++ b/inc/common/util/ai_core/common/scope_allocator.h @@ -32,12 +32,12 @@ class ScopeAllocator { int64_t GetCurrentScopeId(); int64_t AllocateScopeId(void); bool HasScopeAttr(ge::ConstOpDescPtr opdef); - bool GetScopeAttr(ge::ConstOpDescPtr opdef, int64_t& 
scopeId); - bool SetScopeAttr(ge::OpDescPtr opdef, int64_t scopeId); - bool ResetScopeId(int64_t scopeId); + bool GetScopeAttr(ge::ConstOpDescPtr opdef, int64_t& scope_id); + bool SetScopeAttr(ge::OpDescPtr opdef, int64_t scope_id); + bool ResetScopeId(int64_t scope_id); private: - int64_t scopeId; + int64_t scope_id; }; } // namespace fe #endif diff --git a/inc/common/util/ai_core/param_calculate/tensorsize_calculator.h b/inc/common/util/ai_core/param_calculate/tensorsize_calculator.h index c82cca4b..5822539c 100644 --- a/inc/common/util/ai_core/param_calculate/tensorsize_calculator.h +++ b/inc/common/util/ai_core/param_calculate/tensorsize_calculator.h @@ -29,16 +29,16 @@ class TensorSizeCalculator { public: /** * Calculate the tensor size of input and output of each opdesc - * @param opDesc opdesc object - * @param opImplType op impl type + * @param op_desc opdesc object + * @param op_impl_type op impl type * @return status SUCCESS or FAILED */ - static Status CalculateOpTensorSize(ge::OpDesc &opDesc); + static Status CalculateOpTensorSize(ge::OpDesc &op_desc); private: - static Status CalcInputOpTensorSize(ge::OpDesc &opDesc, int32_t &outputRealCalcFlag); + static Status CalcInputOpTensorSize(ge::OpDesc &op_desc, int32_t &output_real_calc_flag); - static Status CalcOutputOpTensorSize(ge::OpDesc &opDesc, int32_t &outputRealCalcFlag); + static Status CalcOutputOpTensorSize(ge::OpDesc &op_desc, int32_t &output_real_calc_flag); }; } // namespace fe diff --git a/inc/common/util/error_manager/error_manager.h b/inc/common/util/error_manager/error_manager.h index 438e68a7..185f84cc 100644 --- a/inc/common/util/error_manager/error_manager.h +++ b/inc/common/util/error_manager/error_manager.h @@ -20,6 +20,7 @@ #include #include #include +#include class ErrorManager { public: @@ -86,6 +87,7 @@ class ErrorManager { int ReadJsonFile(const std::string &file_path, void *handle); bool is_init_ = false; + std::mutex mutex_; std::map error_map_; std::vector error_messages_; 
std::vector warning_messages_; diff --git a/inc/common/util/platform_info.h b/inc/common/util/platform_info.h index 8d2a0579..b278183a 100644 --- a/inc/common/util/platform_info.h +++ b/inc/common/util/platform_info.h @@ -36,66 +36,66 @@ class PlatformInfoManager { uint32_t InitializePlatformInfo(); uint32_t Finalize(); - uint32_t GetPlatformInfo(const string SoCVersion, PlatformInfo &platformInfo, OptionalInfo &optiCompilationInfo); + uint32_t GetPlatformInfo(const string SoCVersion, PlatformInfo &platform_info, OptionalInfo &opti_compilation_info); - uint32_t GetPlatformInfoWithOutSocVersion(PlatformInfo &platformInfo, OptionalInfo &optiCompilationInfo); + uint32_t GetPlatformInfoWithOutSocVersion(PlatformInfo &platform_info, OptionalInfo &opti_compilation_info); - void SetOptionalCompilationInfo(OptionalInfo &optiCompilationInfo); + void SetOptionalCompilationInfo(OptionalInfo &opti_compilation_info); private: PlatformInfoManager(); ~PlatformInfoManager(); - uint32_t LoadIniFile(string iniFileRealPath); + uint32_t LoadIniFile(string ini_file_real_path); void Trim(string &str); - uint32_t LoadConfigFile(string realPath); + uint32_t LoadConfigFile(string real_path); string RealPath(const std::string &path); string GetSoFilePath(); - void ParseVersion(map &versionMap, string &socVersion, PlatformInfo &platformInfoTemp); + void ParseVersion(map &version_map, string &soc_version, PlatformInfo &platform_info_temp); - void ParseSocInfo(map &socInfoMap, PlatformInfo &platformInfoTemp); + void ParseSocInfo(map &soc_info_map, PlatformInfo &platform_info_temp); - void ParseCubeOfAICoreSpec(map &aiCoreSpecMap, PlatformInfo &platformInfoTemp); + void ParseCubeOfAICoreSpec(map &ai_core_spec_map, PlatformInfo &platform_info_temp); - void ParseBufferOfAICoreSpec(map &aiCoreSpecMap, PlatformInfo &platformInfoTemp); + void ParseBufferOfAICoreSpec(map &ai_core_spec_map, PlatformInfo &platform_info_temp); - void ParseUBOfAICoreSpec(map &aiCoreSpecMap, PlatformInfo 
&platformInfoTemp); + void ParseUBOfAICoreSpec(map &ai_core_spec_map, PlatformInfo &platform_info_temp); - void ParseUnzipOfAICoreSpec(map &aiCoreSpecMap, PlatformInfo &platformInfoTemp); + void ParseUnzipOfAICoreSpec(map &ai_core_spec_map, PlatformInfo &platform_info_temp); - void ParseAICoreSpec(map &aiCoreSpecMap, PlatformInfo &platformInfoTemp); + void ParseAICoreSpec(map &ai_core_spec_map, PlatformInfo &platform_info_temp); - void ParseBufferOfAICoreMemoryRates(map &aiCoreMemoryRatesMap, PlatformInfo &platformInfoTemp); + void ParseBufferOfAICoreMemoryRates(map &ai_core_memory_rates_map, PlatformInfo &platform_info_temp); - void ParseAICoreMemoryRates(map &aiCoreMemoryRatesMap, PlatformInfo &platformInfoTemp); + void ParseAICoreMemoryRates(map &ai_core_memory_rates_map, PlatformInfo &platform_info_temp); - void ParseUBOfAICoreMemoryRates(map &aiCoreMemoryRatesMap, PlatformInfo &platformInfoTemp); + void ParseUBOfAICoreMemoryRates(map &ai_core_memory_rates_map, PlatformInfo &platform_info_temp); - void ParseAICoreintrinsicDtypeMap(map &aiCoreintrinsicDtypeMap, PlatformInfo &platformInfoTemp); + void ParseAICoreintrinsicDtypeMap(map &ai_coreintrinsic_dtype_map, PlatformInfo &platform_info_temp); - void ParseVectorCoreSpec(map &vectorCoreSpecMap, PlatformInfo &platformInfoTemp); + void ParseVectorCoreSpec(map &vector_core_spec_map, PlatformInfo &platform_info_temp); - void ParseVectorCoreMemoryRates(map &vectorCoreMemoryRatesMap, PlatformInfo &platformInfoTemp); + void ParseVectorCoreMemoryRates(map &vector_core_memory_rates_map, PlatformInfo &platform_info_temp); - void ParseCPUCache(map &CPUCacheMap, PlatformInfo &platformInfoTemp); + void ParseCPUCache(map &CPUCacheMap, PlatformInfo &platform_info_temp); - void ParseVectorCoreintrinsicDtypeMap(map &vectorCoreintrinsicDtypeMap, - PlatformInfo &platformInfoTemp); + void ParseVectorCoreintrinsicDtypeMap(map &vector_coreintrinsic_dtype_map, + PlatformInfo &platform_info_temp); - uint32_t 
ParsePlatformInfoFromStrToStruct(map> &contentInfoMap, string &socVersion, - PlatformInfo &platformInfoTemp); + uint32_t ParsePlatformInfoFromStrToStruct(map> &content_info_map, string &soc_version, + PlatformInfo &platform_info_temp); - uint32_t AssemblePlatformInfoVector(map> &contentInfoMap); + uint32_t AssemblePlatformInfoVector(map> &content_info_map); private: - bool initFlag_; - map platformInfoMap_; - OptionalInfo optiCompilationInfo_; + bool init_flag_; + map platform_info_map_; + OptionalInfo opti_compilation_info_; }; } // namespace fe #endif diff --git a/inc/common/util/platform_info_def.h b/inc/common/util/platform_info_def.h index c660e8f1..76371506 100644 --- a/inc/common/util/platform_info_def.h +++ b/inc/common/util/platform_info_def.h @@ -30,111 +30,113 @@ enum MemoryType { DDR = 0, HBM }; enum L2Type { Cache = 0, Buff }; -typedef struct tagStrInfo { - string aicVersion; - string ccecAICVersion; - string ccecAIVVersion; - string isSupportAIcpuCompiler; +typedef struct tag_str_info { + string aic_version; + string ccec_aic_version; + string ccec_aiv_version; + string is_support_ai_cpu_compiler; } StrInfo; -typedef struct tagSoCInfo { - uint32_t aiCoreCnt; - uint32_t vectorCoreCnt; - uint32_t aiCpuCnt; - MemoryType memoryType; - uint64_t memorySize; - L2Type l2Type; - uint64_t l2Size; +typedef struct tag_so_c_info { + uint32_t ai_core_cnt; + uint32_t vector_core_cnt; + uint32_t ai_cpu_cnt; + MemoryType memory_type; + uint64_t memory_size; + L2Type l2_type; + uint64_t l2_size; uint32_t l2PageNum; } SoCInfo; -typedef struct tagAiCoreSpec { - double cubeFreq; - uint64_t cubeMSize; - uint64_t cubeNSize; - uint64_t cubeKSize; - uint64_t vecCalcSize; - uint64_t l0ASize; - uint64_t l0BSize; - uint64_t l0CSize; - uint64_t l1Size; - uint64_t smaskBuffer; - uint64_t ubSize; - uint64_t ubblockSize; - uint64_t ubbankSize; - uint64_t ubbankNum; - uint64_t ubburstInOneBlock; - uint64_t ubbankGroupNum; - uint32_t unzipEngines; - uint32_t unzipMaxRatios; - uint32_t 
unzipChannels; - uint8_t unzipIsTight; +typedef struct tag_ai_core_spec { + double cube_freq; + uint64_t cube_m_size; + uint64_t cube_n_size; + uint64_t cube_k_size; + uint64_t vec_calc_size; + uint64_t l0_a_size; + uint64_t l0_b_size; + uint64_t l0_c_size; + uint64_t l1_size; + uint64_t smask_buffer; + uint64_t ub_size; + uint64_t ubblock_size; + uint64_t ubbank_size; + uint64_t ubbank_num; + uint64_t ubburst_in_one_block; + uint64_t ubbank_group_num; + uint32_t unzip_engines; + uint32_t unzip_max_ratios; + uint32_t unzip_channels; + uint8_t unzip_is_tight; + uint8_t cube_vector_split; } AiCoreSpec; -typedef struct tagAiCoreMemoryRates { - double ddrRate; - double ddrReadRate; - double ddrWriteRate; - double l2Rate; - double l2ReadRate; - double l2WriteRate; - double l1ToL0ARate; - double l1ToL0BRate; - double l1ToUBRate; - double l0CToUBRate; - double ubToL2Rate; - double ubToDdrRate; - double ubToL1Rate; +typedef struct tag_ai_core_memory_rates { + double ddr_rate; + double ddr_read_rate; + double ddr_write_rate; + double l2_rate; + double l2_read_rate; + double l2_write_rate; + double l1_to_l0_a_rate; + double l1_to_l0_b_rate; + double l1_to_ub_rate; + double l0_c_to_ub_rate; + double ub_to_l2_rate; + double ub_to_ddr_rate; + double ub_to_l1_rate; } AiCoreMemoryRates; -typedef struct tagVectorCoreSpec { - double vecFreq; - uint64_t vecCalcSize; - uint64_t smaskBuffer; - uint64_t ubSize; - uint64_t ubblockSize; - uint64_t ubbankSize; - uint64_t ubbankNum; - uint64_t ubburstInOneBlock; - uint64_t ubbankGroupNum; - uint64_t vectorRegSize; - uint64_t predicateRegSize; - uint64_t addressRegSize; +typedef struct tag_vector_core_spec { + double vec_freq; + uint64_t vec_calc_size; + uint64_t smask_buffer; + uint64_t ub_size; + uint64_t ubblock_size; + uint64_t ubbank_size; + uint64_t ubbank_num; + uint64_t ubburst_in_one_block; + uint64_t ubbank_group_num; + uint64_t vector_reg_size; + uint64_t predicate_reg_size; + uint64_t address_reg_size; + uint64_t 
alignment_reg_size; } VectorCoreSpec; -typedef struct tagVectorCoreMemoryRates { - double ddrRate; - double ddrReadRate; - double ddrWriteRate; - double l2Rate; - double l2ReadRate; - double l2WriteRate; - double ubToL2Rate; - double ubToDdrRate; +typedef struct tag_vector_core_memory_rates { + double ddr_rate; + double ddr_read_rate; + double ddr_write_rate; + double l2_rate; + double l2_read_rate; + double l2_write_rate; + double ub_to_l2_rate; + double ub_to_ddr_rate; } VectorCoreMemoryRates; -typedef struct tagCPUCache { +typedef struct tag_cpu_cache { uint32_t AICPUSyncBySW; uint32_t TSCPUSyncBySW; } CPUCache; -typedef struct tagPlatformInfo { - StrInfo strInfo; - SoCInfo socInfo; - AiCoreSpec aiCoreSpec; - AiCoreMemoryRates aiCoreMemoryRates; - map> aiCoreIntrinsicDtypeMap; - VectorCoreSpec vectorCoreSpec; - VectorCoreMemoryRates vectorCoreMemoryRates; +typedef struct tag_platform_info { + StrInfo str_info; + SoCInfo soc_info; + AiCoreSpec ai_core_spec; + AiCoreMemoryRates ai_core_memory_rates; + map> ai_core_intrinsic_dtype_map; + VectorCoreSpec vector_core_spec; + VectorCoreMemoryRates vector_core_memory_rates; CPUCache cpucache; - map> vectorCoreIntrinsicDtypeMap; + map> vector_core_intrinsic_dtype_map; } PlatformInfo; -typedef struct tagOptionalInfo { - string socVersion; - string coreType; - uint32_t aiCoreNum; - string l1FusionFlag; +typedef struct tag_optional_info { + string soc_version; + string core_type; + uint32_t ai_core_num; + string l1_fusion_flag; } OptionalInfo; } // namespace fe #endif diff --git a/inc/external/ge/ge_api_error_codes.h b/inc/external/ge/ge_api_error_codes.h index 7b045d54..e7f52724 100644 --- a/inc/external/ge/ge_api_error_codes.h +++ b/inc/external/ge/ge_api_error_codes.h @@ -70,7 +70,7 @@ using Status = uint32_t; // General error code GE_ERRORNO(0, 0, 0, 0, 0, SUCCESS, 0, "success"); -GE_ERRORNO(0b11, 0b11, 0b111, 0xFF, 0b11111, FAILED, 0xFFF, "failed"); /*lint !e401*/ +GE_ERRORNO(0b11, 0b11, 0b111, 0xFF, 0b11111, FAILED, 
0xFFF, "failed"); } // namespace ge #endif // INC_EXTERNAL_GE_GE_API_ERROR_CODES_H_ diff --git a/inc/external/ge/ge_ir_build.h b/inc/external/ge/ge_ir_build.h index acf6991a..f3b3a3a3 100644 --- a/inc/external/ge/ge_ir_build.h +++ b/inc/external/ge/ge_ir_build.h @@ -89,5 +89,26 @@ graphStatus aclgrphSaveModel(const string &output_file, const ModelBufferData &m */ graphStatus aclgrphGetIRVersion(int *major_version, int *minor_version, int *patch_version); +/** + * @ingroup AscendCL + * @brief infer shape and data type + * + * @param graph[IN] the graph ready to build + * @retval GRAPH_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +graphStatus aclgrphInferShapeAndType(ge::Graph &graph); + +/** + * @ingroup AscendCL + * @brief dump graph + * + * @param graph[IN] the graph ready to build + * @param file[IN] file path + * @param file[IN] file path string len + * @retval GRAPH_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +graphStatus aclgrphDumpGraph(const ge::Graph &graph, const char *file, const size_t len); }; // namespace ge #endif diff --git a/inc/external/graph/ascend_string.h b/inc/external/graph/ascend_string.h new file mode 100644 index 00000000..04bf31ac --- /dev/null +++ b/inc/external/graph/ascend_string.h @@ -0,0 +1,38 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INC_EXTERNAL_GRAPH_ASCEND_STRING_H_ +#define INC_EXTERNAL_GRAPH_ASCEND_STRING_H_ + +#include +#include + +namespace ge { +class AscendString { + public: + AscendString() = default; + + ~AscendString() = default; + + explicit AscendString(const char* name); + + const char* GetString() const; + + private: + std::shared_ptr name_; +}; +} // namespace ge +#endif // INC_EXTERNAL_GRAPH_ASCEND_STRING_H_ diff --git a/inc/external/graph/attr_value.h b/inc/external/graph/attr_value.h index af430f9b..32fce04c 100644 --- a/inc/external/graph/attr_value.h +++ b/inc/external/graph/attr_value.h @@ -34,7 +34,6 @@ using std::vector; namespace ge { class AttrValueImpl; -/*lint -e148*/ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY AttrValue { public: using INT = int64_t; @@ -70,6 +69,5 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY AttrValue { VALUE_SET_GET_DEC(AttrValue::FLOAT) #undef VALUE_SET_GET_DEC }; -/*lint +e148*/ } // namespace ge #endif // INC_EXTERNAL_GRAPH_ATTR_VALUE_H_ diff --git a/inc/external/graph/ge_error_codes.h b/inc/external/graph/ge_error_codes.h index d815a22d..26e0d70b 100644 --- a/inc/external/graph/ge_error_codes.h +++ b/inc/external/graph/ge_error_codes.h @@ -33,6 +33,7 @@ using graphStatus = uint32_t; const graphStatus GRAPH_FAILED = 0xFFFFFFFF; const graphStatus GRAPH_SUCCESS = 0; const graphStatus GRAPH_PARAM_INVALID = 50331649; +const graphStatus GRAPH_NODE_WITHOUT_CONST_INPUT = 50331648; } // namespace ge #endif // INC_EXTERNAL_GRAPH_GE_ERROR_CODES_H_ diff --git a/inc/external/graph/gnode.h b/inc/external/graph/gnode.h new file mode 100644 index 00000000..6ed39dd1 --- /dev/null +++ b/inc/external/graph/gnode.h @@ -0,0 +1,129 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INC_EXTERNAL_GRAPH_NODE_H_ +#define INC_EXTERNAL_GRAPH_NODE_H_ + +#include +#include + +#include "./ge_error_codes.h" +#include "./types.h" +#include "./tensor.h" +#include "./ascend_string.h" + +namespace ge { +class AttrValue; +class GNode; +class OpDesc; +class Graph; +class ComputeGraph; +using GNodePtr = std::shared_ptr; +using GraphPtr = std::shared_ptr; +using OpBytes = std::vector; +using OpDescPtr = std::shared_ptr; +using ComputeGraphPtr = std::shared_ptr; + +class NodeImpl; +class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GNode { + public: + GNode(); + + ~GNode() = default; + + graphStatus GetType(ge::AscendString &type) const; + + graphStatus GetName(ge::AscendString &name) const; + + std::pair GetInDataNodesAndPortIndexs(const int32_t index) const; + + std::vector GetInControlNodes() const; + + std::vector> GetOutDataNodesAndPortIndexs(const int32_t index) const; + + std::vector GetOutControlNodes() const; + + graphStatus GetInputConstData(const int32_t index, Tensor &data) const; + + graphStatus GetInputIndexByName(const ge::AscendString &name, int32_t &index); + + graphStatus GetOutputIndexByName(const ge::AscendString &name, int32_t &index); + + size_t GetInputsSize() const; + + size_t GetOutputsSize() const; + + graphStatus GetInputDesc(const int32_t index, TensorDesc &tensor_desc) const; + + graphStatus UpdateInputDesc(const int32_t index, const TensorDesc &tensor_desc); + + graphStatus GetOutputDesc(const int32_t index, TensorDesc &tensor_desc) const; + + graphStatus UpdateOutputDesc(const int32_t index, 
const TensorDesc &tensor_desc); + + graphStatus GetAttr(const ge::AscendString &name, int64_t &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, int32_t &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, uint32_t &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, float &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, ge::AscendString &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, bool &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, Tensor &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, std::vector &attr_values) const; + graphStatus GetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, OpBytes &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, std::vector> &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, ge::DataType &attr_value) const; + graphStatus GetAttr(const ge::AscendString &name, AttrValue &attr_value) const; + + graphStatus SetAttr(const ge::AscendString &name, int64_t &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, int32_t &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, uint32_t &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, float &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, ge::AscendString 
&attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, bool &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, Tensor &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, std::vector &attr_values) const; + graphStatus SetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, OpBytes &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, std::vector> &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, std::vector &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, ge::DataType &attr_value) const; + graphStatus SetAttr(const ge::AscendString &name, AttrValue &attr_value) const; + + bool HasAttr(const ge::AscendString &name); + + graphStatus GetSubgraph(uint32_t index, GraphPtr graph) const; + + graphStatus GetALLSubgraphs(std::vector graph_list) const; + + private: + std::shared_ptr impl_; + friend class NodeAdapter; +}; +} // namespace ge + +#endif // INC_EXTERNAL_GRAPH_NODE_H_ diff --git a/inc/external/graph/graph.h b/inc/external/graph/graph.h index 30886733..c6d1346b 100644 --- a/inc/external/graph/graph.h +++ b/inc/external/graph/graph.h @@ -23,11 +23,14 @@ #include #include "./operator.h" +#include "./gnode.h" namespace ge { +class Graph; class GraphImpl; using GraphImplPtr = std::shared_ptr; +using GraphPtr = std::shared_ptr; class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Graph { friend class GraphUtils; @@ -53,15 +56,15 @@ class GE_FUNC_DEV_VISIBILITY 
GE_FUNC_HOST_VISIBILITY Graph { graphStatus AddOp(const ge::Operator &op); - graphStatus FindOpByName(const string &name, ge::Operator &op) const; + graphStatus FindOpByName(const std::string &name, ge::Operator &op) const; - graphStatus FindOpByType(const string &type, std::vector &ops) const; + graphStatus FindOpByType(const std::string &type, std::vector &ops) const; - graphStatus GetAllOpName(std::vector &op_name) const; + graphStatus GetAllOpName(std::vector &op_name) const; - graphStatus SaveToFile(const string &file_name) const; + graphStatus SaveToFile(const std::string &file_name) const; - graphStatus LoadFromFile(const string &file_name); + graphStatus LoadFromFile(const std::string &file_name); const std::string &GetName() const; @@ -73,6 +76,22 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Graph { /// void SetNeedIteration(bool need_iteration); + std::vector GetAllNodes() const; + + std::vector GetDirectNode() const; + + graphStatus RemoveNode(GNode &node); + + graphStatus RemoveEdge(GNode &src_node, const int32_t src_port_index, GNode &dst_node, const int32_t dst_port_index); + + GNode AddNodeByOp(const Operator &op); + + graphStatus AddDataEdge(GNode &src_node, const int32_t src_port_index, GNode &dst_node, const int32_t dst_port_index); + + graphStatus AddControlEdge(GNode &src_node, GNode &dst_node); + + static GraphPtr ConstructFromInputs(const std::vector &inputs, const ge::AscendString &name); + private: GraphImplPtr impl_{nullptr}; }; diff --git a/inc/external/graph/operator.h b/inc/external/graph/operator.h index 81d726eb..042898c0 100644 --- a/inc/external/graph/operator.h +++ b/inc/external/graph/operator.h @@ -63,7 +63,6 @@ using std::function; using std::shared_ptr; using std::string; -/*lint -e148*/ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Operator { public: friend class OperatorImpl; @@ -91,7 +90,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Operator { explicit Operator(const string &type); - 
Operator(const string &name, const string &type); // lint !e148 + Operator(const string &name, const string &type); virtual ~Operator() = default; @@ -104,7 +103,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Operator { // Only has one output index = 0 Operator &SetInput(const string &dst_name, const Operator &src_oprt); - Operator &SetInput(const string &dst_name, const Operator &src_oprt, const string &name); // lint !e148 + Operator &SetInput(const string &dst_name, const Operator &src_oprt, const string &name); Operator &SetInput(const string &dst_name, const Operator &src_oprt, uint32_t index); @@ -128,22 +127,22 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Operator { TensorDesc GetOutputDesc(uint32_t index) const; - graphStatus UpdateOutputDesc(const string &name, const TensorDesc &tensor_desc); // lint !e148 + graphStatus UpdateOutputDesc(const string &name, const TensorDesc &tensor_desc); TensorDesc GetDynamicInputDesc(const string &name, uint32_t index) const; - graphStatus UpdateDynamicInputDesc(const string &name, uint32_t index, const TensorDesc &tensor_desc); // lint !e148 + graphStatus UpdateDynamicInputDesc(const string &name, uint32_t index, const TensorDesc &tensor_desc); TensorDesc GetDynamicOutputDesc(const string &name, uint32_t index) const; - graphStatus UpdateDynamicOutputDesc(const string &name, uint32_t index, const TensorDesc &tensor_desc); // lint !e148 + graphStatus UpdateDynamicOutputDesc(const string &name, uint32_t index, const TensorDesc &tensor_desc); - graphStatus InferShapeAndType(); // lint !e148 + graphStatus InferShapeAndType(); void SetInferenceContext(const InferenceContextPtr &inference_context); InferenceContextPtr GetInferenceContext() const; - graphStatus VerifyAllAttr(bool disable_common_verifier = false); // lint !e148 + graphStatus VerifyAllAttr(bool disable_common_verifier = false); size_t GetInputsSize() const; @@ -256,20 +255,19 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Operator { 
void RequiredAttrRegister(const string &name); - graphStatus VerifyAll(); // lint !e148 + graphStatus VerifyAll(); // Only has one output index = 0 Operator &SetInput(const string &dst_name, uint32_t dst_index, const Operator &src_oprt); - Operator &SetInput(const string &dst_name, uint32_t dst_index, const Operator &src_oprt, - const string &name); // lint !e148 + Operator &SetInput(const string &dst_name, uint32_t dst_index, const Operator &src_oprt, const string &name); void SubgraphRegister(const string &ir_name, bool dynamic); void SubgraphCountRegister(const string &ir_name, uint32_t count); void SetSubgraphBuilder(const string &ir_name, uint32_t index, const SubgraphBuilder &builder); private: - Operator &SetInput(const string &dst_name, const OutHandler &out_handler); // lint !e148 + Operator &SetInput(const string &dst_name, const OutHandler &out_handler); OutHandler GetOutput(const string &name) const; @@ -283,7 +281,6 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Operator { std::shared_ptr GetNode() const; }; -/*lint +e148*/ } // namespace ge #endif // INC_EXTERNAL_GRAPH_OPERATOR_H_ diff --git a/inc/external/graph/tensor.h b/inc/external/graph/tensor.h index 800e1037..5174c248 100644 --- a/inc/external/graph/tensor.h +++ b/inc/external/graph/tensor.h @@ -126,6 +126,5 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Tensor { friend class TensorAdapter; }; } // namespace ge -/*lint +e148*/ #endif // INC_EXTERNAL_GRAPH_TENSOR_H_ diff --git a/inc/external/hccl/hccl.h b/inc/external/hccl/hccl.h new file mode 100644 index 00000000..46d934e6 --- /dev/null +++ b/inc/external/hccl/hccl.h @@ -0,0 +1,134 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file hccl.h + * @brief HCCL API + */ + +#ifndef HCCL_H_ +#define HCCL_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/** + * @brief Initialize HCCL. + * + * @param clusterInfo A string identifying the cluster info file path, include file name. + * @param rank An integer identifying the id of the rank. + * @param comm A pointer identifying the initialized communication resource. + * @return HcclResult + * @see HcclCommDestroy() + */ +extern HcclResult HcclCommInitClusterInfo(const char *clusterInfo, uint32_t rank, HcclComm *comm); + +/** + * @brief Get hccl root info. + * + * @param rootInfo A pointer identifying the hccl root info. + * @return HcclResult + */ +extern HcclResult HcclGetRootInfo(HcclRootInfo *rootInfo); + +/** + * @brief Initialize HCCL with root info. + * + * @param nRanks An integer identifying the rank size of the cluster. + * @param rootInfo A struct identifying the hccl root info. + * @param rank An integer identifying the id of the rank. + * @param comm A pointer identifying the initialized communication resource. + * @return HcclResult + * @see HcclCommDestroy() + */ +extern HcclResult HcclCommInitRootInfo(uint32_t nRanks, const HcclRootInfo *rootInfo, uint32_t rank, HcclComm *comm); + +/** + * @brief AllReduce operator. + * + * @param sendBuf A pointer identifying the input data address of the operator. + * @param recvBuf A pointer identifying the output data address of the operator. + * @param count An integer(u64) identifying the number of the output data. 
+ * @param dataType The data type of the operator, must be one of the following types: int8, int16, int32, float16, + * float32. + * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod. + * @param comm A pointer identifying the communication resource based on. + * @param stream A pointer identifying the stream information. + * @return HcclResult + */ +extern HcclResult HcclAllReduce(void *sendBuf, void *recvBuf, uint64_t count, HcclDataType dataType, HcclReduceOp op, + HcclComm comm, aclrtStream stream); + +/** + * @brief Broadcast operator. + * + * @param buf A pointer identifying the data address of the operator. + * @param count An integer(u64) identifying the number of the data. + * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32. + * @param root An integer(u32) identifying the root rank in the operator. + * @param comm A pointer identifying the communication resource based on + * @param stream A pointer identifying the stream information. + * @return HcclResult + */ +extern HcclResult HcclBroadcast(void *buf, uint64_t count, HcclDataType dataType, uint32_t root, HcclComm comm, + aclrtStream stream); + +/** + * @brief ReduceScatter operator. + * + * @param sendBuf A pointer identifying the input data address of the operator. + * @param recvBuf A pointer identifying the output data address of the operator. + * @param recvCount An integer(u64) identifying the number of the output data. + * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32. + * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod. + * @param comm A pointer identifying the communication resource based on. + * @param stream A pointer identifying the stream information. 
+ * @return HcclResult + */ +extern HcclResult HcclReduceScatter(void *sendBuf, void *recvBuf, uint64_t recvCount, HcclDataType dataType, + HcclReduceOp op, HcclComm comm, aclrtStream stream); + +/** + * @brief AllGather operator. + * + * @param sendBuf A pointer identifying the input data address of the operator. + * @param recvBuf A pointer identifying the output data address of the operator. + * @param sendCount An integer(u64) identifying the number of the input data. + * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32. + * @param comm A pointer identifying the communication resource based on. + * @param stream A pointer identifying the stream information. + * @return HcclResult + */ +extern HcclResult HcclAllGather(void *sendBuf, void *recvBuf, uint64_t sendCount, HcclDataType dataType, HcclComm comm, + aclrtStream stream); + +/** + * @brief Destroy HCCL comm + * + * @param comm A pointer identifying the communication resource targeting + * @return HcclResult + * @see HcclCommInitClusterInfo() + */ +extern HcclResult HcclCommDestroy(HcclComm comm); + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // HCCL_H_ diff --git a/inc/external/hccl/hccl_types.h b/inc/external/hccl/hccl_types.h new file mode 100644 index 00000000..0d2b9ca5 --- /dev/null +++ b/inc/external/hccl/hccl_types.h @@ -0,0 +1,101 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file hccl_types.h + * @brief HCCL data type definition + * + */ + +#ifndef HCCL_TYPES_H_ +#define HCCL_TYPES_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/** + * @brief HCCL functions return value definition + */ +typedef enum { + HCCL_SUCCESS = 0, /**< success */ + HCCL_E_PARA = 1, /**< parameter error */ + HCCL_E_PTR = 2, /**< empty pointer */ + HCCL_E_MEMORY = 3, /**< memory error */ + HCCL_E_INTERNAL = 4, /**< internal error */ + HCCL_E_NOT_SUPPORT = 5, /**< not support feature */ + HCCL_E_NOT_FOUND = 6, /**< not found specific resource */ + HCCL_E_UNAVAIL = 7, /**< resource unavailable */ + HCCL_E_SYSCALL = 8, /**< call system interface error */ + HCCL_E_TIMEOUT = 9, /**< timeout */ + HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */ + HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */ + HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */ + HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */ + HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */ + HCCL_E_RUNTIME = 15, /**< call runtime api fail */ + HCCL_E_DRV = 16, /**< call driver api fail */ + HCCL_E_PROFILING = 17, /**< call profiling api fail */ + HCCL_E_CCE = 18, /**< call cce api fail */ + HCCL_E_NETWORK = 19, /**< call network api fail */ + HCCL_E_RESERVED /**< reserved */ +} HcclResult; + +/** + * @brief handle to HCCL communicator + */ +typedef void *HcclComm; + +/** + * @brief HCCL Reduction operation + */ +typedef enum { + HCCL_REDUCE_SUM = 0, /**< sum */ + HCCL_REDUCE_PROD = 1, /**< prod */ + HCCL_REDUCE_MAX = 2, /**< max */ + HCCL_REDUCE_MIN = 3, /**< min */ + HCCL_REDUCE_RESERVED /**< reserved */ +} HcclReduceOp; + +/** + * @brief HCCL data type + */ +typedef enum { + HCCL_DATA_TYPE_INT8 = 0, /**< int8 */ + HCCL_DATA_TYPE_INT16 = 1, /**< int16 */ + HCCL_DATA_TYPE_INT32 = 2, /**< int32 */ + HCCL_DATA_TYPE_FP16 = 3, /**< fp16 */ + 
HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */ + HCCL_DATA_TYPE_INT64 = 5, /**< int64 */ + HCCL_DATA_TYPE_UINT64 = 6, /**< uint64 */ + HCCL_DATA_TYPE_RESERVED /**< reserved */ +} HcclDataType; + +const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length + +/** + * @brief HCCL root info + */ +typedef struct HcclRootInfoDef { + char internal[HCCL_ROOT_INFO_BYTES]; +} HcclRootInfo; + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // HCCL_TYPES_H_ diff --git a/inc/external/register/register.h b/inc/external/register/register.h index f3091fae..f9056171 100644 --- a/inc/external/register/register.h +++ b/inc/external/register/register.h @@ -40,7 +40,6 @@ using std::to_string; using std::unique_ptr; using std::vector; -/*lint -e148*/ namespace ge { class Operator; class TensorDesc; @@ -159,5 +158,4 @@ namespace ge { using OpRegistrationData = domi::OpRegistrationData; using OpReceiver = domi::OpReceiver; } // namespace ge -/*lint +e148*/ #endif // INC_EXTERNAL_REGISTER_REGISTER_H_ diff --git a/inc/external/register/scope/scope_fusion_pass_register.h b/inc/external/register/scope/scope_fusion_pass_register.h index 8e5605a7..c2905927 100644 --- a/inc/external/register/scope/scope_fusion_pass_register.h +++ b/inc/external/register/scope/scope_fusion_pass_register.h @@ -301,7 +301,6 @@ class GE_FUNC_HOST_VISIBILITY GE_FUNC_DEV_VISIBILITY ScopeFusionPassRegistry { private: ScopeFusionPassRegistry(); class ScopeFusionPassRegistryImpl; - /*lint -e148*/ std::unique_ptr impl_; friend class TensorFlowModelParser; }; diff --git a/inc/framework/common/ge_inner_error_codes.h b/inc/framework/common/ge_inner_error_codes.h index 3ab6cf06..ccc8c753 100644 --- a/inc/framework/common/ge_inner_error_codes.h +++ b/inc/framework/common/ge_inner_error_codes.h @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -/*lint -e* */ #ifndef INC_FRAMEWORK_COMMON_GE_INNER_ERROR_CODES_H_ #define INC_FRAMEWORK_COMMON_GE_INNER_ERROR_CODES_H_ @@ -304,6 +303,7 @@ GE_ERRORNO_EXECUTOR(GE_EXEC_ALLOC_WEIGHT_MEM_FAILED, 16, "Failed to allocate wei GE_ERRORNO_EXECUTOR(GE_EXEC_ALLOC_VAR_MEM_FAILED, 17, "Failed to allocate variable memory."); GE_ERRORNO_EXECUTOR(GE_AIPP_NOT_EXIST, 18, "GE AIPP is not exist."); GE_ERRORNO_EXECUTOR(GE_DYNAMIC_AIPP_NOT_SUPPORT_QUERY, 19, "GE Dynamic AIPP is not support to query temporarily."); +GE_ERRORNO_EXECUTOR(GE_EXEC_ALLOC_P2P_MEM_FAILED, 20, "Failed to allocate P2P memory"); // Generator module error code definition GE_ERRORNO_GENERATOR(GE_GENERATOR_GRAPH_MANAGER_INIT_FAILED, 1, "Graph manager initialize failed."); diff --git a/inc/framework/common/op/attr_value_util.h b/inc/framework/common/op/attr_value_util.h index 8a90cfa2..3b242d42 100644 --- a/inc/framework/common/op/attr_value_util.h +++ b/inc/framework/common/op/attr_value_util.h @@ -21,7 +21,6 @@ #include #include -#include "common/types.h" #include "graph/debug/ge_attr_define.h" #include "proto/om.pb.h" diff --git a/inc/framework/common/op/ge_op_utils.h b/inc/framework/common/op/ge_op_utils.h index 87cf54d8..4718b180 100644 --- a/inc/framework/common/op/ge_op_utils.h +++ b/inc/framework/common/op/ge_op_utils.h @@ -22,7 +22,8 @@ #include #include "common/op/attr_value_util.h" -#include "common/types.h" +#include "register/register_types.h" +#include "register/register_error_codes.h" #include "common/util.h" #include "graph/attr_value.h" #include "graph/ge_tensor.h" diff --git a/inc/framework/common/string_util.h b/inc/framework/common/string_util.h index 3e4bf093..b74eddcf 100644 --- a/inc/framework/common/string_util.h +++ b/inc/framework/common/string_util.h @@ -36,8 +36,8 @@ class StringUtils { #endif return s; } - // lint -esym(551,*) - static std::string &Rtrim(std::string &s) { /*lint !e618*/ + + static std::string &Rtrim(std::string &s) { #if __cplusplus >= 201103L 
(void)s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int c) { return !std::isspace(c); })); #else @@ -45,7 +45,7 @@ class StringUtils { #endif return s; } - // lint -esym(551,*) + /// /// @ingroup domi_common /// @brief delete spaces at the beginning and end of a string @@ -61,10 +61,8 @@ class StringUtils { /// @param [in] delim separator /// @return string array after segmentation /// - /*lint -e1077*/ static std::vector Split(const std::string &str, char delim) { std::vector elems; - /*lint +e1077*/ if (str.empty()) { elems.emplace_back(""); diff --git a/inc/framework/common/types.h b/inc/framework/common/types.h index ad284d07..c9400628 100644 --- a/inc/framework/common/types.h +++ b/inc/framework/common/types.h @@ -434,6 +434,7 @@ REGISTER_OPTYPE_DECLARE(HCOMREDUCESCATTER, "HcomReduceScatter"); REGISTER_OPTYPE_DECLARE(HCOMSEND, "HcomSend"); REGISTER_OPTYPE_DECLARE(HCOMRECEIVE, "HcomReceive"); REGISTER_OPTYPE_DECLARE(HCOMREMOTEREAD, "HcomRemoteRead"); +REGISTER_OPTYPE_DECLARE(HCOMREMOTEREFREAD, "HcomRemoteRefRead"); REGISTER_OPTYPE_DECLARE(HCOMREMOTEWRITE, "HcomRemoteWrite"); REGISTER_OPTYPE_DECLARE(VARASSIGN, "VarAssign"); diff --git a/inc/framework/common/util.h b/inc/framework/common/util.h index b1c278d8..7e6c9c68 100644 --- a/inc/framework/common/util.h +++ b/inc/framework/common/util.h @@ -345,7 +345,7 @@ std::string ToString(const google::protobuf::RepeatedField &rpd_field) { /// @return Timestamp, in microseconds (US) /// /// -uint64_t GetCurrentTimestap(); +uint64_t GetCurrentTimestamp(); /// /// @ingroup domi_common diff --git a/inc/framework/engine/dnnengine.h b/inc/framework/engine/dnnengine.h index 65897ac5..1bcf5e07 100644 --- a/inc/framework/engine/dnnengine.h +++ b/inc/framework/engine/dnnengine.h @@ -30,6 +30,7 @@ enum PriorityEnum { COST_0 = 0, COST_1, COST_2, + COST_3, COST_9 = 9, COST_10 = 10, }; diff --git a/inc/framework/generator/ge_generator.h b/inc/framework/generator/ge_generator.h index 37bca897..c446b983 100644 --- 
a/inc/framework/generator/ge_generator.h +++ b/inc/framework/generator/ge_generator.h @@ -86,6 +86,7 @@ class GeGenerator { Status BuildSingleOp(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, const string &model_file_name, OpEngineType engine_type, ModelBufferData &model_buff, bool is_offline = true); + Status CheckForSingleOp(OpDescPtr &op_desc, const vector &inputs, const vector &outputs); class Impl; diff --git a/inc/framework/memory/memory_api.h b/inc/framework/memory/memory_api.h index ebb7e68c..7c87fe74 100644 --- a/inc/framework/memory/memory_api.h +++ b/inc/framework/memory/memory_api.h @@ -21,6 +21,7 @@ #include #include "ge/ge_api_error_codes.h" +#include "graph//types.h" #include "runtime/mem.h" namespace ge { @@ -35,6 +36,12 @@ struct HostVarInfo { uint64_t var_size; }; +struct TensorInfo { + std::string var_name; + std::vector dims; + DataType data_type; +}; + /// /// \param size [in] rdma pool memory size to be allocated. /// \param mem_type [in] memory type for rdma pool. @@ -47,6 +54,13 @@ Status InitRdmaPool(size_t size, rtMemType_t mem_type = RT_MEMORY_HBM); /// \return Status result of function Status RdmaRemoteRegister(const std::vector &var_info, rtMemType_t mem_type = RT_MEMORY_HBM); +/// +/// \param tensor_info [in] description for tensor stored shared memory. +/// \param dev_addr [out] malloced shared memory addr. +/// \param memory_size [out] malloced shared memory size. +/// \return Status result of function +Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uint64_t &memory_size); + /// /// \param var_name [in] var_name name of host variable. /// \param base_addr [out] base_addr vase addr of host variable. 
diff --git a/inc/framework/memory/memory_assigner.h b/inc/framework/memory/memory_assigner.h index bbec014b..4552fa7c 100644 --- a/inc/framework/memory/memory_assigner.h +++ b/inc/framework/memory/memory_assigner.h @@ -33,7 +33,7 @@ class MemoryAssigner { MemoryAssigner &operator=(const MemoryAssigner &) = delete; - Status AssignMemory(bool is_loop_graph, size_t &mem_offset, size_t &zero_copy_mem_size); + Status AssignMemory(bool is_loop_graph, map &mem_offset, size_t &zero_copy_mem_size); private: ge::ComputeGraphPtr compute_graph_; diff --git a/inc/framework/omg/omg.h b/inc/framework/omg/omg.h index 45a8896d..71f94c98 100644 --- a/inc/framework/omg/omg.h +++ b/inc/framework/omg/omg.h @@ -21,7 +21,6 @@ #include #include #include -#include "framework/common/types.h" #include "framework/omg/omg_inner_types.h" #include "framework/omg/parser/parser_inner_ctx.h" #include "proto/ge_ir.pb.h" @@ -92,8 +91,6 @@ void GetGroupName(ge::proto::ModelDef &model); void FindParserSo(const string &path, vector &fileList, string &caffe_parser_path); -Status CheckCustomAiCpuOpLib(); - Status DumpInfershapeJson(const ge::Graph &graph, const char *json_file); Status SetOutputNodeInfo(ge::Graph &graph, const std::string &output_type, const std::string &output_format); diff --git a/inc/framework/omg/omg_inner_types.h b/inc/framework/omg/omg_inner_types.h index e1a7da0b..c48d1649 100644 --- a/inc/framework/omg/omg_inner_types.h +++ b/inc/framework/omg/omg_inner_types.h @@ -25,7 +25,6 @@ #include #include #include "framework/common/fmk_error_codes.h" -#include "framework/common/types.h" #include "register/register_fmk_types.h" using domi::DOMI_TENSOR_ND; @@ -92,6 +91,8 @@ struct OmgContext { std::map> out_nodes_map; // user-designate out nodes (this is used for determing the orders) std::vector> user_out_nodes; + // default out nodes (this is used for determing the orders) + std::vector> default_out_nodes; // save the output node of the network, value = topName, // topName indicates the 
output name of the operator. std::vector user_out_nodes_top_vec; @@ -99,8 +100,6 @@ struct OmgContext { std::vector net_out_nodes; // net out nodes top names(only caffe has top) std::vector out_top_names; - // path for the aicpu custom operator so_file - std::vector aicpu_op_run_paths; // preferential format used by the entire network domiTensorFormat_t net_format = DOMI_TENSOR_RESERVED; domi::FrameworkType type = domi::FRAMEWORK_RESERVED; diff --git a/inc/graph/buffer.h b/inc/graph/buffer.h index ca4355a7..e6be3daa 100644 --- a/inc/graph/buffer.h +++ b/inc/graph/buffer.h @@ -57,11 +57,11 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Buffer { // For compatibility inline const std::uint8_t *data() const { return GetData(); } - inline std::uint8_t *data() { return GetData(); } // lint !e659 + inline std::uint8_t *data() { return GetData(); } inline std::size_t size() const { return GetSize(); } inline void clear() { return ClearBuffer(); } - uint8_t operator[](size_t index) const { // lint !e1022 !e1042 - if (buffer_ != nullptr && index < buffer_->size()) { // lint !e574 + uint8_t operator[](size_t index) const { + if (buffer_ != nullptr && index < buffer_->size()) { return (uint8_t)(*buffer_)[index]; } return 0xff; diff --git a/inc/graph/compute_graph.h b/inc/graph/compute_graph.h index 2ec6b663..9a454f39 100644 --- a/inc/graph/compute_graph.h +++ b/inc/graph/compute_graph.h @@ -84,7 +84,6 @@ class ComputeGraph : public std::enable_shared_from_this, public A NodePtr FindNode(const std::string &name) const; NodePtr FindFirstNodeMatchType(const std::string &name) const; - /*lint -e504*/ // AddNode with NodePtr NodePtr AddNode(NodePtr node); NodePtr AddNode(OpDescPtr op); @@ -152,7 +151,6 @@ class ComputeGraph : public std::enable_shared_from_this, public A graphStatus InsertEventNodes(); bool operator==(const ComputeGraph &r_compute_graph) const; - /*lint +e504*/ const std::map, std::vector> &GetShareParamLayer() const { return params_share_map_; } diff 
--git a/inc/graph/debug/ge_attr_define.h b/inc/graph/debug/ge_attr_define.h index 47b11ba8..d0335eb8 100644 --- a/inc/graph/debug/ge_attr_define.h +++ b/inc/graph/debug/ge_attr_define.h @@ -14,7 +14,6 @@ * limitations under the License. */ -/*lint -e618*/ #ifndef INC_GRAPH_DEBUG_GE_ATTR_DEFINE_H_ #define INC_GRAPH_DEBUG_GE_ATTR_DEFINE_H_ @@ -33,6 +32,8 @@ namespace ge { #define GE_FUNC_DEV_VISIBILITY #endif // Public attribute +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_FORCE_UNKNOWN_SHAPE; + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_IS_UNKNOWN_SHAPE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED; @@ -1021,8 +1022,6 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_FUSION_GROUP_KEY; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_FUSION_VIRTUAL_OP; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_FUSION_GROUP_TYPE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_INPUT_MEM_TYPE_LIST; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OUTPUT_MEM_TYPE_LIST; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_L1_FUSION_EXTEND_PTR; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_GET_TENSOR_ACTUAL_SIZE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OUTPUT_OFFSET_FOR_L1_FUSION; @@ -1044,6 +1043,13 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_TBE_KERNEL_NAME; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_TBE_KERNEL_BUFFER; +// used for 
memory allocate +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_INPUT_MEM_TYPE_LIST; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OUTPUT_MEM_TYPE_LIST; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_WORKSPACE_TYPE_LIST; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_TENSOR_MEM_TYPE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_P2P_MEMORY_SIZE; + // for unregistered op GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_UNREGST_OPPATH; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_UNREGST_ATTRLIST; @@ -1121,10 +1127,12 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_VAR GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_INPUT_MEMORY_TYPE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_OUTPUT_MEMORY_TYPE; +// stage +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_STAGE_LEVEL; + // input_output_offset GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_ZERO_COPY_BASIC_OFFSET; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_ZERO_COPY_RELATIVE_OFFSET; } // namespace ge #endif // INC_GRAPH_DEBUG_GE_ATTR_DEFINE_H_ -/*lint +e618*/ diff --git a/inc/graph/detail/any_map.h b/inc/graph/detail/any_map.h index 70533ea1..21eb08b0 100644 --- a/inc/graph/detail/any_map.h +++ b/inc/graph/detail/any_map.h @@ -38,7 +38,7 @@ class TypeID { bool operator==(const TypeID &__arg) const { return type_ == __arg.type_; } private: - explicit TypeID(string type) : type_(std::move(type)) {} // lint !e30 !e32 + explicit TypeID(string type) : type_(std::move(type)) {} string type_; }; diff --git a/inc/graph/detail/attributes_holder.h b/inc/graph/detail/attributes_holder.h index 
49741143..cdaec821 100644 --- a/inc/graph/detail/attributes_holder.h +++ b/inc/graph/detail/attributes_holder.h @@ -50,7 +50,7 @@ class OpDef; class GraphDef; } // namespace proto -using ProtoAttrMap = ::google::protobuf::Map<::std::string, ::ge::proto::AttrDef>; // lint !e1073 +using ProtoAttrMap = ::google::protobuf::Map<::std::string, ::ge::proto::AttrDef>; using ProtoMsgOwner = std::shared_ptr<::google::protobuf::Message>; template @@ -147,7 +147,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY AttrHolder { protected: graphStatus AddRequiredAttr(const std::string &name); const std::unordered_set GetAllAttrNames() const; - const std::map GetAllAttrs() const; // lint !e1073 + const std::map GetAllAttrs() const; virtual ProtoAttrMapHelper MutableAttrMap() = 0; virtual ConstProtoAttrMapHelper GetAttrMap() const = 0; diff --git a/inc/graph/ge_attr_value.h b/inc/graph/ge_attr_value.h index 0c265c20..b665beba 100644 --- a/inc/graph/ge_attr_value.h +++ b/inc/graph/ge_attr_value.h @@ -310,7 +310,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeAttrValue { VALUE_SET_GET_DEC(GeAttrValue::GRAPH) VALUE_SET_GET_DEC(BYTES) VALUE_SET_GET_DEC(NamedAttrs) - VALUE_SET_GET_DEC(ge::DataType) // lint !e665 + VALUE_SET_GET_DEC(ge::DataType) VALUE_SET_GET_DEC(vector) VALUE_SET_GET_DEC(vector) VALUE_SET_GET_DEC(vector) @@ -320,8 +320,8 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeAttrValue { VALUE_SET_GET_DEC(vector) VALUE_SET_GET_DEC(vector) VALUE_SET_GET_DEC(vector) - VALUE_SET_GET_DEC(vector>) // lint !e665 - VALUE_SET_GET_DEC(vector) // lint !e665 + VALUE_SET_GET_DEC(vector>) + VALUE_SET_GET_DEC(vector) #undef VALUE_SET_GET_DEC GeIrProtoHelper value_; diff --git a/inc/graph/ge_context.h b/inc/graph/ge_context.h index 53985e9c..a20bbbe3 100644 --- a/inc/graph/ge_context.h +++ b/inc/graph/ge_context.h @@ -33,7 +33,7 @@ class GEContext { void SetCtxDeviceId(uint32_t device_id); private: - uint64_t session_id_ = 0; + thread_local static uint64_t 
session_id_; uint32_t device_id_ = 0; uint64_t trace_id_ = 0; }; // class GEContext diff --git a/inc/graph/ge_local_context.h b/inc/graph/ge_local_context.h index b47098fb..58efe37b 100644 --- a/inc/graph/ge_local_context.h +++ b/inc/graph/ge_local_context.h @@ -33,6 +33,11 @@ class GEThreadLocalContext { void SetSessionOption(map options_map); void SetGlobalOption(map options_map); + map GetAllGraphOptions() const; + map GetAllSessionOptions() const; + map GetAllGlobalOptions() const; + map GetAllOptions() const; + private: map graph_options_; map session_options_; diff --git a/inc/graph/node.h b/inc/graph/node.h index f4a1c6a8..e1ffe5b7 100644 --- a/inc/graph/node.h +++ b/inc/graph/node.h @@ -193,7 +193,7 @@ class Node : public std::enable_shared_from_this { vector out_data_anchors_; InControlAnchorPtr in_control_anchor_; OutControlAnchorPtr out_control_anchor_; - map attrs_; // lint !e1073 + map attrs_; bool has_init_{false}; bool host_node_{false}; bool anchor_status_updated_{false}; diff --git a/inc/graph/range_vistor.h b/inc/graph/range_vistor.h index 8635d413..20905bd9 100644 --- a/inc/graph/range_vistor.h +++ b/inc/graph/range_vistor.h @@ -22,10 +22,8 @@ template class RangeVistor { public: - /*lint -e151*/ using Iterator = typename std::vector::iterator; using ConstIterator = typename std::vector::const_iterator; - /*lint +e151*/ RangeVistor(O owner, const std::vector &vs) : owner_(owner), elements_(vs) {} @@ -43,9 +41,7 @@ class RangeVistor { bool empty() const { return elements_.empty(); } - /*lint -e659*/ E &at(std::size_t index) { return elements_.at(index); } - /*lint +e659*/ const E &at(std::size_t index) const { return elements_.at(index); } diff --git a/inc/graph/utils/graph_utils.h b/inc/graph/utils/graph_utils.h index fdcbe1a9..2933d034 100644 --- a/inc/graph/utils/graph_utils.h +++ b/inc/graph/utils/graph_utils.h @@ -19,18 +19,18 @@ #include #include +#include #include #include -#include -#include #include +#include #include "graph/anchor.h" 
-#include "graph/node.h" #include "graph/compute_graph.h" -#include "graph/utils/anchor_utils.h" #include "graph/graph.h" #include "graph/model.h" +#include "graph/node.h" +#include "graph/utils/anchor_utils.h" #define GE_DUMP(compute_graph, name) \ do { \ @@ -206,6 +206,8 @@ class GraphUtils { static void DumpGEGraph(const ge::ComputeGraphPtr &graph, const std::string &suffix, bool is_always_dump = false, const std::string &user_graph_name = ""); + static void DumpGEGrph(const ge::ComputeGraphPtr &graph, const std::string &path, const std::string &suffix); + static bool LoadGEGraph(const char *file, ge::ComputeGraph &compute_graph); static bool LoadGEGraph(const char *file, ge::ComputeGraphPtr &compute_graph); @@ -214,6 +216,8 @@ class GraphUtils { static void DumpGEGraphToOnnx(const ge::ComputeGraph &compute_graph, const std::string &suffix); + static void DumpGrphToOnnx(const ge::ComputeGraph &compute_graph, const std::string &path, const std::string &suffix); + static bool LoadGEGraphFromOnnx(const char *file, ge::ComputeGraph &compute_graph); static bool ReadProtoFromTextFile(const char *file, google::protobuf::Message *message); @@ -559,7 +563,8 @@ class ComputeGraphBuilder { class CompleteGraphBuilder : public ComputeGraphBuilder { public: - explicit CompleteGraphBuilder(std::string name) : name_(std::move(name)), parent_node_(nullptr) {} + explicit CompleteGraphBuilder(std::string name, bool retval_flag = true) + : name_(std::move(name)), parent_node_(nullptr), retval_flag_(retval_flag) {} CompleteGraphBuilder(const CompleteGraphBuilder &) = delete; CompleteGraphBuilder &operator=(const CompleteGraphBuilder &) = delete; CompleteGraphBuilder(const CompleteGraphBuilder &&) = delete; @@ -687,8 +692,37 @@ class CompleteGraphBuilder : public ComputeGraphBuilder { /// void BuildGraphTargets(graphStatus &error_code, std::string &error_msg); + /// + /// @brief Add NetOutput node + /// @param [out] error_code + /// @param [out] error_msg + /// @return void + /// + 
void AddNetOutputNode(graphStatus &error_code, std::string &error_msg); + + /// + /// @brief Build NetOutput nodes with data & ctrl edges + /// @param [in] net_output_desc + /// @param [in] peer_out_anchors + /// @param [out] error_code + /// @param [out] error_msg + /// @return void + /// + void BuildNetOutputNodeWithLink(const OpDescPtr &net_output_desc, + const std::vector &peer_out_anchors, graphStatus &error_code, + std::string &error_msg); + + /// + /// @brief process after build + /// @param [out] error_code + /// @param [out] error_msg + /// @return void + /// + void PostProcess(graphStatus &error_code, std::string &error_msg); + std::string name_; NodePtr parent_node_; + bool retval_flag_; std::map, std::vector>> graph_inputs_; std::vector> graph_outputs_; std::vector graph_targets_; diff --git a/inc/graph/utils/node_adapter.h b/inc/graph/utils/node_adapter.h new file mode 100644 index 00000000..2d44e52a --- /dev/null +++ b/inc/graph/utils/node_adapter.h @@ -0,0 +1,32 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INC_GRAPH_UTILS_NODE_ADAPTER_H_ +#define INC_GRAPH_UTILS_NODE_ADAPTER_H_ + +#include "graph/gnode.h" +#include "graph/node.h" + +namespace ge { +using NodePtr = std::shared_ptr; +class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY NodeAdapter { + public: + static GNode Node2GNode(const NodePtr &node); + static NodePtr GNode2Node(const GNode &node); + static GNodePtr Node2GNodePtr(const NodePtr &node); +}; +} // namespace ge +#endif // INC_GRAPH_UTILS_NODE_ADAPTER_H_ diff --git a/inc/graph/utils/node_utils.h b/inc/graph/utils/node_utils.h index bf57148d..26b37f60 100644 --- a/inc/graph/utils/node_utils.h +++ b/inc/graph/utils/node_utils.h @@ -83,6 +83,7 @@ class NodeUtils { static std::string GetNodeType(const Node &node); static std::string GetNodeType(const NodePtr &node); + static std::vector GetAllSubgraphs(const Node &node); static ComputeGraphPtr GetSubgraph(const Node &node, uint32_t index); static graphStatus SetSubgraph(Node &node, uint32_t index, const ComputeGraphPtr &subgraph); @@ -162,6 +163,13 @@ class NodeUtils { static graphStatus GetInputConstData(const Node &node, const string &dst_name, GeTensorPtr &ge_tensor); + /// + /// @brief Get node type in cross subgragh. + /// @param [in] node + /// @return type + /// + static std::string GetInConstNodeTypeCrossSubgraph(const ge::NodePtr &node); + private: static std::map> map_send_info_; static std::map> map_recv_info_; diff --git a/inc/common/util/ai_core/param_calculate/aicore_param_calculator.h b/src/common/graph/ascend_string.cc similarity index 62% rename from inc/common/util/ai_core/param_calculate/aicore_param_calculator.h rename to src/common/graph/ascend_string.cc index c0c378fd..597b634f 100644 --- a/inc/common/util/ai_core/param_calculate/aicore_param_calculator.h +++ b/src/common/graph/ascend_string.cc @@ -14,20 +14,20 @@ * limitations under the License. 
*/ -#ifndef AICORE_PARAM_CALCULATOR -#define AICORE_PARAM_CALCULATOR +#include "external/graph/ascend_string.h" -#include "graph/node.h" -#include "graph_optimizer/graph_optimize_register_error_codes.h" +namespace ge { +AscendString::AscendString(const char* name) { + if (name != nullptr) { + name_ = std::shared_ptr(new (std::nothrow) std::string(name)); + } +} -namespace fe { -class AICoreParamCalculator { - public: - AICoreParamCalculator(); +const char* AscendString::GetString() const { + if (name_ == nullptr) { + return nullptr; + } - ~AICoreParamCalculator(); - - Status CalcOpRunningParam(ge::Node &node); -}; -} // namespace fe -#endif // AICORE_PARAM_CALCULATOR + return (*name_).c_str(); +} +} // namespace ge diff --git a/src/common/graph/format_refiner.cc b/src/common/graph/format_refiner.cc index c716825a..9a072849 100644 --- a/src/common/graph/format_refiner.cc +++ b/src/common/graph/format_refiner.cc @@ -41,6 +41,7 @@ using namespace ge; using namespace std; namespace ge { namespace { +const size_t kDimSize4d = 4; const std::unordered_set kChangeDimNodes = {PERMUTE, EXPANDDIMS, SQUEEZE}; const string kIsGraphInferred = "_is_graph_inferred"; thread_local RefRelations reflection_builder; @@ -410,28 +411,26 @@ graphStatus FormatRefiner::DataNodeFormatProcess(const ComputeGraphPtr &graph, s GE_CHECK_NOTNULL(data_node); auto op_desc = data_node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); - GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(0)); - auto curr_format = op_desc->GetOutputDescPtr(0)->GetOriginFormat(); + + auto input_desc = op_desc->MutableInputDesc(0); + auto output_desc = op_desc->MutableOutputDesc(0); + GE_CHECK_NOTNULL(input_desc); + GE_CHECK_NOTNULL(output_desc); + + auto curr_format = output_desc->GetOriginFormat(); if (curr_format != FORMAT_ND) { // Data format has been infered , continue continue; } - // Set format for un-infered data node - auto input_descs = op_desc->GetAllInputsDescPtr(); - auto output_descs = op_desc->GetAllOutputsDescPtr(); - - 
for (const auto &input_desc : input_descs) { - if (input_desc != nullptr) { - input_desc->SetOriginFormat(data_format); - input_desc->SetFormat(data_format); - } - } - for (const auto &output_desc : output_descs) { - if (output_desc != nullptr) { - output_desc->SetOriginFormat(data_format); - output_desc->SetFormat(data_format); - } + // keep data format be ND because lacking of defination when input shape num is smaller than 4 + if (input_desc->MutableShape().GetDimNum() < kDimSize4d) { + continue; } + // Set format for un-infered data node + input_desc->SetOriginFormat(data_format); + input_desc->SetFormat(data_format); + output_desc->SetOriginFormat(data_format); + output_desc->SetFormat(data_format); uninfered_data_nodes.push_back(data_node); } // Reinfer format from uninfered data nodes diff --git a/src/common/graph/ge_attr_define.cc b/src/common/graph/ge_attr_define.cc index 9b723bb3..7b9f023f 100644 --- a/src/common/graph/ge_attr_define.cc +++ b/src/common/graph/ge_attr_define.cc @@ -18,6 +18,8 @@ namespace ge { // Public attribute +const std::string ATTR_NAME_FORCE_UNKNOWN_SHAPE = "_force_unknown_shape"; + const std::string ATTR_NAME_IS_UNKNOWN_SHAPE = "_is_unknown_shape"; const std::string ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED = "_dynamic_shape_partitioned"; @@ -718,6 +720,8 @@ const std::string ATTR_MODEL_MEMORY_SIZE = "memory_size"; const std::string ATTR_MODEL_ZERO_COPY_MEMORY_SIZE = "zero_copy_memory_size"; +const std::string ATTR_MODEL_P2P_MEMORY_SIZE = "p2p_memory_size"; + const std::string ATTR_MODEL_OUT_NODES_NAME = "attr_model_out_nodes_name"; const std::string ATTR_MODEL_WEIGHT_SIZE = "weight_size"; @@ -957,8 +961,6 @@ const std::string ATTR_NAME_FUSION_GROUP_KEY = "_fusion_group_key"; const std::string ATTR_NAME_L1_FUSION_GROUP_KEY = "_l1_fusion_group_key"; const std::string ATTR_NAME_FUSION_VIRTUAL_OP = "_fusion_virtual_op"; const std::string ATTR_NAME_FUSION_GROUP_TYPE = "_fusion_group_type"; -const std::string ATTR_NAME_INPUT_MEM_TYPE_LIST = 
"_input_memory_type"; -const std::string ATTR_NAME_OUTPUT_MEM_TYPE_LIST = "_output_memory_type"; const std::string ATTR_NAME_L1_FUSION_EXTEND_PTR = "_l1_fusion_extend_content"; const std::string ATTR_NAME_GET_TENSOR_ACTUAL_SIZE = "_tensor_actual_size"; const std::string ATTR_NAME_OUTPUT_OFFSET_FOR_L1_FUSION = "_output_offset_for_l1_fuison"; @@ -980,6 +982,12 @@ const std::string ATTR_NAME_OP_COMPILE_STRATEGY = "_op_compile_strategy"; const std::string ATTR_NAME_TBE_KERNEL_NAME = "_tbe_kernel_name"; const std::string ATTR_NAME_TBE_KERNEL_BUFFER = "_tbe_kernel_buffer"; +// used for memory allocate +const std::string ATTR_NAME_INPUT_MEM_TYPE_LIST = "_input_memory_type"; +const std::string ATTR_NAME_OUTPUT_MEM_TYPE_LIST = "_output_memory_type"; +const std::string ATTR_NAME_WORKSPACE_TYPE_LIST = "_workspace_type"; +const std::string ATTR_NAME_TENSOR_MEM_TYPE = "_tensor_memory_type"; + // Op debug attrs const std::string ATTR_OP_DEBUG_FLAG = "_op_debug_flag"; const std::string ATTR_OP_DEBUG_MODE = "_op_debug_mode"; @@ -1080,6 +1088,9 @@ const std::string ATTR_VARIABLE_PLACEMENT = "_variable_placement"; const std::string ATTR_INPUT_MEMORY_TYPE = "_input_memory_type"; const std::string ATTR_OUTPUT_MEMORY_TYPE = "_output_memory_type"; +// stage +const std::string ATTR_STAGE_LEVEL = "_stage_level"; + // input_output_offset const std::string ATTR_ZERO_COPY_BASIC_OFFSET = "_zero_copy_basic_offset"; const std::string ATTR_ZERO_COPY_RELATIVE_OFFSET = "_zero_copy_relative_offset"; diff --git a/src/common/graph/ge_attr_value.cc b/src/common/graph/ge_attr_value.cc index a8490470..8a62134f 100644 --- a/src/common/graph/ge_attr_value.cc +++ b/src/common/graph/ge_attr_value.cc @@ -33,8 +33,7 @@ using std::vector; namespace ge { NamedAttrs::NamedAttrs() { named_attrs_.InitDefault(); } -NamedAttrs::NamedAttrs(const ProtoMsgOwner &owner, proto::NamedAttrs *proto_msg) - : named_attrs_(owner, proto_msg) {} // lint !e1744 +NamedAttrs::NamedAttrs(const ProtoMsgOwner &owner, proto::NamedAttrs 
*proto_msg) : named_attrs_(owner, proto_msg) {} void NamedAttrs::SetName(const std::string &name) { auto proto_msg = named_attrs_.GetProtoMsg(); @@ -239,7 +238,7 @@ ATTR_VALUE_SET_GET_IMP(GeAttrValue::STR) ATTR_VALUE_SET_GET_IMP(vector) ATTR_VALUE_SET_GET_IMP(GeAttrValue::INT) ATTR_VALUE_SET_GET_IMP(vector) -ATTR_VALUE_SET_GET_IMP(GeAttrValue::FLOAT) // lint !e524 +ATTR_VALUE_SET_GET_IMP(GeAttrValue::FLOAT) ATTR_VALUE_SET_GET_IMP(vector) ATTR_VALUE_SET_GET_IMP(GeAttrValue::BOOL) ATTR_VALUE_SET_GET_IMP(vector) @@ -253,11 +252,9 @@ ATTR_VALUE_SET_GET_IMP(GeAttrValue::BYTES) ATTR_VALUE_SET_GET_IMP(vector) ATTR_VALUE_SET_GET_IMP(GeAttrValue::NAMED_ATTRS) ATTR_VALUE_SET_GET_IMP(vector) -/*lint -e665*/ ATTR_VALUE_SET_GET_IMP(vector>) -/*lint +e665*/ -ATTR_VALUE_SET_GET_IMP(vector) // lint !e665 -ATTR_VALUE_SET_GET_IMP(GeAttrValue::DATA_TYPE) // lint !e665 +ATTR_VALUE_SET_GET_IMP(vector) +ATTR_VALUE_SET_GET_IMP(GeAttrValue::DATA_TYPE) #undef ATTR_VALUE_SET_GET_IMP @@ -785,14 +782,14 @@ bool GeAttrValueImp::GetValue(const proto::AttrDef &proto_attr_val, const ProtoM if (graph_def == nullptr) { GELOGE(GRAPH_FAILED, "proto::GraphDef make shared failed"); graph_def = nullptr; - return false; // lint !e665 + return false; } else { ModelSerializeImp imp; imp.SetProtobufOwner(graph_def); if (!imp.UnserializeGraph(graph, *graph_def)) { GELOGE(GRAPH_FAILED, "UnserializeGraph Failed"); return false; - } // lint !e514 + } value = graph; } return true; @@ -812,7 +809,7 @@ bool GeAttrValueImp::GetValue(const proto::AttrDef &proto_attr_val, const ProtoM if (graph_def == nullptr) { GELOGE(GRAPH_FAILED, "proto::GraphDef make shared failed"); graph_def = nullptr; - return false; // lint !e665 + return false; } else { ComputeGraphPtr graph = nullptr; ModelSerializeImp imp; @@ -820,7 +817,7 @@ bool GeAttrValueImp::GetValue(const proto::AttrDef &proto_attr_val, const ProtoM if (!imp.UnserializeGraph(graph, *graph_def)) { GELOGE(GRAPH_FAILED, "UnserializeGraph Failed"); return false; - } // 
lint !e514 + } value.push_back(graph); } } @@ -972,9 +969,7 @@ ATTR_UTILS_SET_IMP(Tensor, GeTensor) ATTR_UTILS_SET_GET_IMP(NamedAttrs, GeAttrValue::NAMED_ATTRS) ATTR_UTILS_SET_GET_IMP(Bytes, Buffer) ATTR_UTILS_SET_GET_IMP(Graph, ComputeGraphPtr) -/*lint -e665*/ ATTR_UTILS_SET_GET_IMP(ListListInt, vector>) -/*lint +e665*/ ATTR_UTILS_SET_GET_IMP(ListInt, vector) ATTR_UTILS_SET_IMP(ListInt, vector) @@ -989,8 +984,8 @@ ATTR_UTILS_SET_IMP(ListTensor, vector) ATTR_UTILS_SET_GET_IMP(ListNamedAttrs, vector) ATTR_UTILS_SET_GET_IMP(ListBytes, vector) ATTR_UTILS_SET_GET_IMP(ListGraph, vector) -ATTR_UTILS_SET_GET_IMP(ListDataType, vector) // lint !e665 -ATTR_UTILS_SET_GET_IMP(DataType, ge::DataType) // lint !e665 +ATTR_UTILS_SET_GET_IMP(ListDataType, vector) +ATTR_UTILS_SET_GET_IMP(DataType, ge::DataType) bool AttrUtils::SetListTensor(AttrHolderAdapter &&obj, const string &name, std::initializer_list &&value) { @@ -1159,7 +1154,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool AttrUtils::GetListOpDesc(Con } for (const auto &item : bytes_vals) { ModelSerialize serialize; - auto op_desc = serialize.UnserializeOpDesc(item.GetData(), item.GetSize()); // lint !e732 + auto op_desc = serialize.UnserializeOpDesc(item.GetData(), item.GetSize()); value.push_back(op_desc); } return true; @@ -1211,7 +1206,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDescPtr AttrUtils::CloneOpDesc( op_def = ComGraphMakeShared(); if (op_def == nullptr) { GELOGE(GRAPH_FAILED, "proto::OpDef make shared failed"); - return nullptr; // lint !e665 + return nullptr; } ModelSerializeImp imp; (void)imp.SerializeOpDesc(org_op_desc, op_def.get()); diff --git a/src/common/graph/gnode.cc b/src/common/graph/gnode.cc new file mode 100644 index 00000000..33450a5c --- /dev/null +++ b/src/common/graph/gnode.cc @@ -0,0 +1,857 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance 
with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "graph/gnode.h" + +#include +#include "debug/ge_util.h" +#include "framework/common/debug/ge_log.h" +#include "graph/anchor.h" +#include "graph/node.h" +#include "graph/utils/node_adapter.h" +#include "graph/utils/tensor_adapter.h" +#include +#include "graph/debug/ge_attr_define.h" +#include "utils/node_utils.h" +#include "utils/op_desc_utils.h" + +namespace ge { +class NodeImpl { + public: + NodeImpl() = default; + ~NodeImpl() = default; + + NodeImpl(NodeImpl &) = delete; + NodeImpl &operator=(const NodeImpl &) = delete; + + std::weak_ptr node_ptr_; +}; + +NodePtr NodeAdapter::GNode2Node(const ge::GNode &graph_node) { + if (graph_node.impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GNode2Node: gnode impl is nullptr."); + return nullptr; + } + + return graph_node.impl_->node_ptr_.lock(); +} + +GNode NodeAdapter::Node2GNode(const ge::NodePtr &node) { + if (node == nullptr) { + GELOGE(GRAPH_FAILED, "Node2GNode: node is nullptr"); + return GNode(); + } + + GNode graph_node; + if (graph_node.impl_ == nullptr) { + GELOGW("Node2GNode: gnode impl is nullptr, node[%s].", node->GetName().c_str()); + return graph_node; + } + graph_node.impl_->node_ptr_ = node; + + return graph_node; +} + +GNodePtr NodeAdapter::Node2GNodePtr(const ge::NodePtr &node) { + if (node == nullptr) { + GELOGE(GRAPH_FAILED, "Node2GNodePtr: node is nullptr"); + return nullptr; + } + + GNodePtr gnode = std::shared_ptr(new (std::nothrow) GNode()); + if (gnode == nullptr) { + GELOGE(GRAPH_FAILED, "Node2GNodePtr: gnode is nullptr, node[%s].", 
node->GetName().c_str()); + return nullptr; + } + + if (gnode->impl_ == nullptr) { + GELOGW("Node2GNode: gnode impl is nullptr, node[%s].", node->GetName().c_str()); + return nullptr; + } + gnode->impl_->node_ptr_ = node; + + return gnode; +} + +GNode::GNode() { impl_ = ComGraphMakeShared(); } + +graphStatus GNode::GetType(ge::AscendString &type) const { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetType: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetType: the shared ptr is not valid."); + return GRAPH_FAILED; + } + std::string node_type = node_ptr->GetType(); + AscendString ascend_type(node_type.c_str()); + type = ascend_type; + + return GRAPH_SUCCESS; +} + +graphStatus GNode::GetName(ge::AscendString &name) const { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetName: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetName: the shared ptr is not valid."); + return GRAPH_FAILED; + } + std::string node_name = node_ptr->GetName(); + AscendString ascend_name(node_name.c_str()); + name = ascend_name; + + return GRAPH_SUCCESS; +} + +std::pair GNode::GetInDataNodesAndPortIndexs(const int32_t index) const { + pair gnode_idx = {nullptr, 0xFF}; + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "Gnode: node impl is nullptr."); + return gnode_idx; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "Gnode: the shared ptr is not valid."); + return gnode_idx; + } + + auto in_anchor = node_ptr->GetInDataAnchor(index); + if (in_anchor == nullptr) { + GELOGE(GRAPH_FAILED, "Failed to get in data node of index[%d] from node[%s], the anchor does not exist", index, + node_ptr->GetName().c_str()); + return gnode_idx; + } + + auto out_anchor = in_anchor->GetPeerOutAnchor(); + if 
(out_anchor == nullptr) { + GELOGE(GRAPH_FAILED, "Failed to get in data node of index[%d] from node [%s], the data input does not exist", index, + node_ptr->GetName().c_str()); + return gnode_idx; + } + + NodePtr peer_node_ptr = out_anchor->GetOwnerNode(); + GNodePtr gnode = NodeAdapter::Node2GNodePtr(peer_node_ptr); + if (gnode == nullptr) { + GELOGE(GRAPH_FAILED, "Peer node of node[%s] to gnode faild.", node_ptr->GetName().c_str()); + return gnode_idx; + } + + return {gnode, out_anchor->GetIdx()}; +} + +std::vector GNode::GetInControlNodes() const { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "Gnode: node impl is nullptr."); + return {}; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "Gnode: the shared ptr is not valid."); + return {}; + } + + std::vector gnodes; + auto in_control_nodes = node_ptr->GetInControlNodes(); + for (auto &in_control_node : in_control_nodes) { + GNodePtr gnode = NodeAdapter::Node2GNodePtr(in_control_node); + if (gnode == nullptr) { + GELOGE(GRAPH_FAILED, "In control_node of node[%s] to gnode faild.", node_ptr->GetName().c_str()); + return {}; + } + gnodes.emplace_back(gnode); + } + + return gnodes; +} + +std::vector> GNode::GetOutDataNodesAndPortIndexs(const int32_t index) const { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "Gnode: node impl is nullptr."); + return {}; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "Gnode: the shared ptr is not valid."); + return {}; + } + + auto out_anchor = node_ptr->GetOutDataAnchor(index); + if (out_anchor == nullptr) { + GELOGE(GRAPH_FAILED, "Failed to get out data node of index %d from node %s, the anchor does not exists", index, + node_ptr->GetName().c_str()); + return {}; + } + + vector> gnode_index; + auto in_data_anchors = out_anchor->GetPeerInDataAnchors(); + for (auto &in_data_anchor : in_data_anchors) { + if (in_data_anchor == nullptr) { + 
GELOGE(GRAPH_FAILED, "In data anchor of node[%s] is nullptr.", node_ptr->GetName().c_str()); + return {}; + } + NodePtr peer_node_ptr = in_data_anchor->GetOwnerNode(); + GNodePtr gnode = NodeAdapter::Node2GNodePtr(peer_node_ptr); + if (gnode == nullptr) { + GELOGE(GRAPH_FAILED, "Peer node of node[%s] to gnode faild.", node_ptr->GetName().c_str()); + return {}; + } + gnode_index.emplace_back(std::pair(gnode, in_data_anchor->GetIdx())); + } + + return gnode_index; +} + +std::vector GNode::GetOutControlNodes() const { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetOutControlNodes: node impl is nullptr."); + return {}; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetOutControlNodes: the node shared ptr is not valid."); + return {}; + } + + std::vector gnodes; + auto out_control_nodes = node_ptr->GetOutControlNodes(); + for (auto &out_control_node : out_control_nodes) { + GNodePtr gnode = NodeAdapter::Node2GNodePtr(out_control_node); + if (gnode == nullptr) { + GELOGE(GRAPH_FAILED, "In control_node of node[%s] to gnode faild.", node_ptr->GetName().c_str()); + return {}; + } + gnodes.emplace_back(gnode); + } + + return gnodes; +} + +graphStatus GNode::GetInputConstData(const int32_t index, Tensor &data) const { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetInputConstData: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetInputConstData: the node shared ptr is not valid."); + return GRAPH_FAILED; + } + + NodePtr input_data_node = NodeUtils::GetInDataNodeByIndex(*node_ptr, index); + bool is_const = NodeUtils::IsConst(*input_data_node); + if (!is_const) { + GELOGE(GRAPH_NODE_WITHOUT_CONST_INPUT, "Node[%s] has no const input.", node_ptr->GetName().c_str()); + return GRAPH_NODE_WITHOUT_CONST_INPUT; + } + + Operator const_op = OpDescUtils::CreateOperatorFromNode(input_data_node); + if 
(const_op.GetAttr(ATTR_NAME_WEIGHTS, data) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Input data node[%s] of node[%s] get data failed.", input_data_node->GetName().c_str(), + node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + return GRAPH_SUCCESS; +} + +graphStatus GNode::GetInputIndexByName(const ge::AscendString &name, int32_t &index) { + const char *ascend_name = name.GetString(); + if (ascend_name == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "GetInputIndexByName: ascend string error."); + return GRAPH_PARAM_INVALID; + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetInputIndexByName: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetInputIndexByName: the node shared ptr is not valid."); + return GRAPH_FAILED; + } + + OpDescPtr op_desc = node_ptr->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "Get op desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + std::string node_name = ascend_name; + index = op_desc->GetInputIndexByName(node_name); + + return GRAPH_SUCCESS; +} + +graphStatus GNode::GetOutputIndexByName(const ge::AscendString &name, int32_t &index) { + const char *ascend_name = name.GetString(); + if (ascend_name == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "GetOutputIndexByName: ascend string error."); + return GRAPH_PARAM_INVALID; + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetOutputIndexByName: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetOutputIndexByName: the node shared ptr is not valid."); + return GRAPH_FAILED; + } + + OpDescPtr op_desc = node_ptr->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "Get op desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + std::string node_name = 
ascend_name; + index = op_desc->GetOutputIndexByName(node_name); + + return GRAPH_SUCCESS; +} + +size_t GNode::GetInputsSize() const { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetInputsSize: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetInputsSize: the node shared ptr is not valid."); + return GRAPH_FAILED; + } + + OpDescPtr op_desc = node_ptr->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "Get op desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + return op_desc->GetInputsSize(); +} + +size_t GNode::GetOutputsSize() const { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetOutputsSize: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetOutputsSize: the shared ptr is not valid."); + return GRAPH_FAILED; + } + + OpDescPtr op_desc = node_ptr->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "Get op desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + return op_desc->GetOutputsSize(); +} + +graphStatus GNode::GetInputDesc(const int32_t index, TensorDesc &tensor_desc) const { + if (index < 0) { + GELOGE(GRAPH_PARAM_INVALID, "GetInputDesc: index[%d] cannot be less than zero.", index); + return GRAPH_PARAM_INVALID; + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetInputDesc: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetInputDesc: the node shared ptr is not valid."); + return GRAPH_FAILED; + } + + OpDescPtr op_desc = node_ptr->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "Get op desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + ConstGeTensorDescPtr 
ge_tensor_desc = op_desc->GetInputDescPtr(static_cast(index)); + if (ge_tensor_desc == nullptr) { + GELOGE(GRAPH_FAILED, "Get tensor desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + tensor_desc = TensorAdapter::GeTensorDesc2TensorDesc(*ge_tensor_desc); + + return GRAPH_SUCCESS; +} + +graphStatus GNode::UpdateInputDesc(const int32_t index, const TensorDesc &tensor_desc) { + if (index < 0) { + GELOGE(GRAPH_PARAM_INVALID, "UpdateInputDesc: index[%d] cannot be less than zero.", index); + return GRAPH_PARAM_INVALID; + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "UpdateInputDesc: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "UpdateInputDesc: the node shared ptr is not valid."); + return GRAPH_FAILED; + } + + OpDescPtr op_desc = node_ptr->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "Get op desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + GeTensorDesc ge_tensor_desc = TensorAdapter::TensorDesc2GeTensorDesc(tensor_desc); + if (op_desc->UpdateInputDesc(static_cast(index), ge_tensor_desc) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Update input desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + return GRAPH_SUCCESS; +} + +graphStatus GNode::GetOutputDesc(const int32_t index, TensorDesc &tensor_desc) const { + if (index < 0) { + GELOGE(GRAPH_PARAM_INVALID, "GetOutputDesc: index[%d] cannot be less than zero.", index); + return GRAPH_PARAM_INVALID; + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetOutputDesc: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetOutputDesc: the node shared ptr is not valid."); + return GRAPH_FAILED; + } + + OpDescPtr op_desc = node_ptr->GetOpDesc(); + if (op_desc == nullptr) { 
+ GELOGE(GRAPH_FAILED, "Get op desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + ConstGeTensorDescPtr ge_tensor_desc = op_desc->GetOutputDescPtr(static_cast(index)); + if (ge_tensor_desc == nullptr) { + GELOGE(GRAPH_FAILED, "Get tensor desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + tensor_desc = TensorAdapter::GeTensorDesc2TensorDesc(*ge_tensor_desc); + + return GRAPH_SUCCESS; +} + +graphStatus GNode::UpdateOutputDesc(const int32_t index, const TensorDesc &tensor_desc) { + if (index < 0) { + GELOGE(GRAPH_PARAM_INVALID, "Gnode: index[%d] cannot be less than zero.", index); + return GRAPH_PARAM_INVALID; + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "UpdateOutputDesc: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "UpdateOutputDesc: the shared ptr is not valid."); + return GRAPH_FAILED; + } + + OpDescPtr op_desc = node_ptr->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "Get op desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + GeTensorDesc ge_tensor_desc = TensorAdapter::TensorDesc2GeTensorDesc(tensor_desc); + if (op_desc->UpdateOutputDesc(static_cast(index), ge_tensor_desc) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Update input desc of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + return GRAPH_SUCCESS; +} + +#define NODE_ATTR_GET_IMP(ArgType) \ + graphStatus GNode::GetAttr(const ge::AscendString &name, ArgType &attr_value) const { \ + const char *ascend_name = name.GetString(); \ + if (ascend_name == nullptr) { \ + GELOGE(GRAPH_PARAM_INVALID, "GetAttr: ascend string error."); \ + return GRAPH_PARAM_INVALID; \ + } \ + \ + if (impl_ == nullptr) { \ + GELOGE(GRAPH_FAILED, "GetAttr: node impl is nullptr."); \ + return GRAPH_FAILED; \ + } \ + \ + std::shared_ptr node_ptr = 
impl_->node_ptr_.lock(); \ + if (node_ptr == nullptr) { \ + GELOGE(GRAPH_FAILED, "GetAttr: the shared ptr is not valid."); \ + return GRAPH_FAILED; \ + } \ + \ + std::string node_name = ascend_name; \ + Operator op = OpDescUtils::CreateOperatorFromNode(node_ptr); \ + if (op.GetAttr(node_name, attr_value) != GRAPH_SUCCESS) { \ + GELOGE(GRAPH_FAILED, "Get attr of node[%s] failed.", node_ptr->GetName().c_str()); \ + return GRAPH_FAILED; \ + } \ + \ + return GRAPH_SUCCESS; \ + } + +#define NODE_ATTR_SET_IMP(ArgType) \ + graphStatus GNode::SetAttr(const ge::AscendString &name, ArgType &attr_value) const { \ + const char *ascend_name = name.GetString(); \ + if (ascend_name == nullptr) { \ + GELOGE(GRAPH_PARAM_INVALID, "SetAttr: ascend string error."); \ + return GRAPH_PARAM_INVALID; \ + } \ + \ + if (impl_ == nullptr) { \ + GELOGE(GRAPH_FAILED, "SetAttr: node impl is nullptr."); \ + return GRAPH_FAILED; \ + } \ + \ + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); \ + if (node_ptr == nullptr) { \ + GELOGE(GRAPH_FAILED, "SetAttr: the shared ptr is not valid."); \ + return GRAPH_FAILED; \ + } \ + \ + std::string node_name = ascend_name; \ + Operator op = OpDescUtils::CreateOperatorFromNode(node_ptr); \ + (void)op.SetAttr(node_name, attr_value); \ + return GRAPH_SUCCESS; \ + } + +NODE_ATTR_GET_IMP(int64_t) +NODE_ATTR_GET_IMP(int32_t) +NODE_ATTR_GET_IMP(uint32_t) +NODE_ATTR_GET_IMP(float) +NODE_ATTR_GET_IMP(bool) +NODE_ATTR_GET_IMP(Tensor) +NODE_ATTR_GET_IMP(std::vector) +NODE_ATTR_GET_IMP(std::vector) +NODE_ATTR_GET_IMP(std::vector) +NODE_ATTR_GET_IMP(std::vector) +NODE_ATTR_GET_IMP(std::vector) +NODE_ATTR_GET_IMP(std::vector) +NODE_ATTR_GET_IMP(OpBytes) +NODE_ATTR_GET_IMP(std::vector>) +NODE_ATTR_GET_IMP(std::vector) +NODE_ATTR_GET_IMP(ge::DataType) +NODE_ATTR_GET_IMP(AttrValue) + +NODE_ATTR_SET_IMP(int64_t) +NODE_ATTR_SET_IMP(int32_t) +NODE_ATTR_SET_IMP(uint32_t) +NODE_ATTR_SET_IMP(float) +NODE_ATTR_SET_IMP(bool) +NODE_ATTR_SET_IMP(Tensor) 
+NODE_ATTR_SET_IMP(std::vector) +NODE_ATTR_SET_IMP(std::vector) +NODE_ATTR_SET_IMP(std::vector) +NODE_ATTR_SET_IMP(std::vector) +NODE_ATTR_SET_IMP(std::vector) +NODE_ATTR_SET_IMP(std::vector) +NODE_ATTR_SET_IMP(OpBytes) +NODE_ATTR_SET_IMP(std::vector>) +NODE_ATTR_SET_IMP(std::vector) +NODE_ATTR_SET_IMP(ge::DataType) + +graphStatus GNode::SetAttr(const ge::AscendString &name, AttrValue &attr_value) const { + const char *ascend_name = name.GetString(); + if (ascend_name == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "SetAttr: ascend string error."); + return GRAPH_PARAM_INVALID; + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "SetAttr: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "SetAttr: the shared ptr is not valid."); + return GRAPH_FAILED; + } + + std::string node_name = ascend_name; + Operator op = OpDescUtils::CreateOperatorFromNode(node_ptr); + (void)op.SetAttr(node_name, std::move(attr_value)); + return GRAPH_SUCCESS; +} + +graphStatus GNode::SetAttr(const ge::AscendString &name, ge::AscendString &attr_value) const { + const char *ascend_name = name.GetString(); + if (ascend_name == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "SetAttr: name ascend string error."); + return GRAPH_PARAM_INVALID; + } + + const char *ascend_attr_value = attr_value.GetString(); + if (ascend_attr_value == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "SetAttr: attr value ascend string error."); + return GRAPH_PARAM_INVALID; + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "SetAttr: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "SetAttr: the shared ptr is not valid."); + return GRAPH_FAILED; + } + std::string node_name = ascend_name; + std::string node_attr_value = ascend_attr_value; + Operator op = OpDescUtils::CreateOperatorFromNode(node_ptr); + 
(void)op.SetAttr(node_name, node_attr_value); + + return GRAPH_SUCCESS; +} + +graphStatus GNode::SetAttr(const ge::AscendString &name, std::vector &attr_values) const { + const char *ascend_name = name.GetString(); + if (ascend_name == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "SetAttr: name ascend string error."); + return GRAPH_PARAM_INVALID; + } + + for (auto &attr_val : attr_values) { + const char *ascend_attr_value = attr_val.GetString(); + if (ascend_attr_value == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "SetAttr: attr val error."); + return GRAPH_PARAM_INVALID; + } + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "SetAttr: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "SetAttr: the shared ptr is not valid."); + return GRAPH_FAILED; + } + vector node_attr_vals; + for (auto attr_val : attr_values) { + if (attr_val.GetString() != nullptr) { + std::string node_attr_val = attr_val.GetString(); + node_attr_vals.emplace_back(node_attr_val); + } + } + std::string node_name = ascend_name; + Operator op = OpDescUtils::CreateOperatorFromNode(node_ptr); + (void)op.SetAttr(node_name, node_attr_vals); + + return GRAPH_SUCCESS; +} + +graphStatus GNode::GetAttr(const ge::AscendString &name, ge::AscendString &attr_value) const { + const char *ascend_name = name.GetString(); + if (ascend_name == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "GetAttr: name ascend string error."); + return GRAPH_PARAM_INVALID; + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetAttr: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetAttr: the node shared ptr is not valid."); + return GRAPH_FAILED; + } + + std::string node_name = ascend_name; + Operator op = OpDescUtils::CreateOperatorFromNode(node_ptr); + std::string op_name; + if (op.GetAttr(node_name, op_name) 
!= GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Get attr of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + ge::AscendString attr_value_get(op_name.c_str()); + attr_value = attr_value_get; + + return GRAPH_SUCCESS; +} + +graphStatus GNode::GetAttr(const ge::AscendString &name, std::vector &attr_values) const { + const char *ascend_name = name.GetString(); + if (ascend_name == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "GetAttr: name ascend string error."); + return GRAPH_PARAM_INVALID; + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetAttr: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetAttr: the node shared ptr is not valid."); + return GRAPH_FAILED; + } + + std::string node_name = ascend_name; + Operator op = OpDescUtils::CreateOperatorFromNode(node_ptr); + vector attr_names; + if (op.GetAttr(node_name, attr_names) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Get attr of node[%s] failed.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + for (auto &attr_name : attr_names) { + AscendString ascend_attr_name(attr_name.c_str()); + attr_values.push_back(ascend_attr_name); + } + + return GRAPH_SUCCESS; +} + +bool GNode::HasAttr(const ge::AscendString &name) { + const char *ascend_name = name.GetString(); + if (ascend_name == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "HasAttr: ascend string error."); + return false; + } + + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "HasAttr: node impl is nullptr."); + return false; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "HasAttr: the node shared ptr is not valid."); + return false; + } + + OpDescPtr op_desc = node_ptr->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "Get op desc of node[%s] failed.", node_ptr->GetName().c_str()); + return false; + } + std::string attr_name = 
ascend_name; + if (!op_desc->HasAttr(attr_name)) { + GELOGE(GRAPH_FAILED, "Node[%s] has no attr name[%s]", node_ptr->GetName().c_str(), attr_name.c_str()); + return false; + } + + return true; +} + +graphStatus GNode::GetSubgraph(uint32_t index, GraphPtr graph) const { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetSubgraph: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetSubgraph: the node shared ptr is not valid."); + return GRAPH_FAILED; + } + + ComputeGraphPtr compute_graph_ptr = NodeUtils::GetSubgraph(*node_ptr, index); + if (compute_graph_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetSubgraph: get subgraph[%u] failed form node[%s].", index, node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + Graph create_graph = GraphUtils::CreateGraphFromComputeGraph(compute_graph_ptr); + graph = std::make_shared(create_graph); + if (graph == nullptr) { + GELOGE(GRAPH_FAILED, "GetSubgraph: graph make shared failed form node[%s].", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + return GRAPH_SUCCESS; +} + +graphStatus GNode::GetALLSubgraphs(std::vector graph_list) const { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetALLSubgraphs: node impl is nullptr."); + return GRAPH_FAILED; + } + + std::shared_ptr node_ptr = impl_->node_ptr_.lock(); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetALLSubgraphs: the node shared ptr is not valid."); + return GRAPH_FAILED; + } + + std::vector sub_graphs = NodeUtils::GetAllSubgraphs(*node_ptr); + if (sub_graphs.empty()) { + GELOGE(GRAPH_FAILED, "GetALLSubgraphs: get all subgraphs failed form node[%s].", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + for (auto &sub_graph : sub_graphs) { + if (sub_graph == nullptr) { + GELOGE(GRAPH_FAILED, "Get subgraph failed form node[%s].", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + Graph create_graph = 
GraphUtils::CreateGraphFromComputeGraph(sub_graph); + GraphPtr graph = std::make_shared(create_graph); + if (graph == nullptr) { + GELOGE(GRAPH_FAILED, "Subgraph make shared failed form node[%s].", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + graph_list.emplace_back(graph); + } + + return GRAPH_SUCCESS; +} +} // namespace ge diff --git a/src/common/graph/graph.cc b/src/common/graph/graph.cc index fc30e9d6..181b38d1 100644 --- a/src/common/graph/graph.cc +++ b/src/common/graph/graph.cc @@ -15,6 +15,7 @@ */ #include "external/graph/graph.h" +#include #include "debug/ge_util.h" #include "framework/common/debug/ge_log.h" #include "graph/debug/ge_attr_define.h" @@ -22,6 +23,7 @@ #include "graph/model.h" #include "graph/utils/graph_utils.h" #include "graph/utils/op_desc_utils.h" +#include "graph/utils/node_adapter.h" using std::map; using std::pair; @@ -242,6 +244,8 @@ class GraphImpl { const std::string &GetName() const { return name_; } + ComputeGraphPtr GetComputeGraph() const { return compute_graph_; } + private: std::string name_; std::string output_name_; @@ -261,7 +265,7 @@ graphStatus Graph::AddOp(const ge::Operator &op) { return impl_->AddOp(op); } -graphStatus Graph::GetAllOpName(std::vector &op_name) const { +graphStatus Graph::GetAllOpName(std::vector &op_name) const { GE_CHK_BOOL_EXEC(impl_ != nullptr, return GRAPH_FAILED, "GetAllOpName failed: graph can not be used, impl is nullptr."); return impl_->GetAllOpName(op_name); @@ -335,6 +339,235 @@ void Graph::SetNeedIteration(bool need_iteration) { impl_->SetNeedIteration(need_iteration); } +std::vector Graph::GetAllNodes() const { + std::vector graph_nodes; + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetAllNodes: graph can not be used, impl is nullptr."); + return graph_nodes; + } + + ComputeGraphPtr compute_graph_ptr = impl_->GetComputeGraph(); + if (compute_graph_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetAllNodes: compute graph ptr is nullptr."); + return graph_nodes; + } + + for (auto 
&node : compute_graph_ptr->GetAllNodes()) { + GNode gnode = NodeAdapter::Node2GNode(node); + graph_nodes.emplace_back(gnode); + } + + return graph_nodes; +} + +std::vector Graph::GetDirectNode() const { + std::vector graph_nodes; + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "GetDirectNode: graph can not be used, impl is nullptr."); + return graph_nodes; + } + ComputeGraphPtr compute_graph_ptr = impl_->GetComputeGraph(); + if (compute_graph_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "GetDirectNode: compute graph ptr is nullptr."); + return graph_nodes; + } + + for (auto &node : compute_graph_ptr->GetDirectNode()) { + GNode gnode = NodeAdapter::Node2GNode(node); + graph_nodes.emplace_back(gnode); + } + + return graph_nodes; +} + +graphStatus Graph::RemoveNode(GNode &node) { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "RemoveNode: graph can not be used, impl is nullptr."); + return GRAPH_FAILED; + } + + NodePtr node_ptr = NodeAdapter::GNode2Node(node); + if (node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "RemoveNode: gnode to node failed."); + return GRAPH_FAILED; + } + + ComputeGraphPtr compute_graph_ptr = impl_->GetComputeGraph(); + if (compute_graph_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "RemoveNde: compute graph ptr is nullptr."); + return GRAPH_FAILED; + } + + if (compute_graph_ptr->RemoveNode(node_ptr) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "RemoveNde: remove node failed."); + return GRAPH_FAILED; + } + + return GRAPH_SUCCESS; +} + +graphStatus Graph::RemoveEdge(GNode &src_node, const int32_t src_port_index, GNode &dst_node, + const int32_t dst_port_index) { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "RemoveEdge: graph can not be used, impl is nullptr."); + return GRAPH_FAILED; + } + + if ((src_port_index == -1) && (dst_port_index != -1)) { + GELOGE(GRAPH_FAILED, "RemoveEdge:src control anchor link to dst data anchor not exists."); + return GRAPH_FAILED; + } + + NodePtr src_node_ptr = NodeAdapter::GNode2Node(src_node); + if (src_node_ptr == 
nullptr) { + GELOGE(GRAPH_FAILED, "RemoveEdge: src gnode to node failed."); + return GRAPH_FAILED; + } + + NodePtr dst_node_ptr = NodeAdapter::GNode2Node(dst_node); + if (dst_node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "RemoveEdge: dst gnode to node failed."); + return GRAPH_FAILED; + } + + graphStatus res = GRAPH_FAILED; + if ((src_port_index == -1) && (dst_port_index == -1)) { + res = GraphUtils::RemoveEdge(src_node_ptr->GetOutControlAnchor(), dst_node_ptr->GetInControlAnchor()); + if (res != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "RemoveEdge: remove control edge failed."); + return GRAPH_FAILED; + } + return GRAPH_SUCCESS; + } + + if (src_port_index != -1 && dst_port_index == -1) { + res = GraphUtils::RemoveEdge(src_node_ptr->GetOutDataAnchor(src_port_index), dst_node_ptr->GetInControlAnchor()); + if (res != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "RemoveEdge: remove data-control edge failed."); + return GRAPH_FAILED; + } + return GRAPH_SUCCESS; + } + + res = GraphUtils::RemoveEdge(src_node_ptr->GetOutDataAnchor(src_port_index), + dst_node_ptr->GetInDataAnchor(dst_port_index)); + if (res != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "RemoveEdge: remove data edge failed."); + return GRAPH_FAILED; + } + return GRAPH_SUCCESS; +} + +GNode Graph::AddNodeByOp(const Operator &op) { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "AddNodeByOp: graph can not be used, impl is nullptr."); + return GNode(); + } + + std::shared_ptr op_desc = ge::OpDescUtils::GetOpDescFromOperator(op); + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "AddNodeByOp: get op desc from op[%s] failed.", op.GetName().c_str()); + return GNode(); + } + + ComputeGraphPtr compute_graph_ptr = impl_->GetComputeGraph(); + if (compute_graph_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "AddNodeByOp: compute graph ptr is nullptr."); + return GNode(); + } + + NodePtr node_ptr = compute_graph_ptr->AddNode(op_desc); + GNode gnode = NodeAdapter::Node2GNode(node_ptr); + + return gnode; +} + +graphStatus 
Graph::AddDataEdge(GNode &src_node, const int32_t src_port_index, GNode &dst_node, + const int32_t dst_port_index) { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "AddDataEdge: graph can not be used, impl is nullptr."); + return GRAPH_FAILED; + } + + NodePtr src_node_ptr = NodeAdapter::GNode2Node(src_node); + if (src_node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "AddDataEdge: src gnode to node failed."); + return GRAPH_FAILED; + } + + NodePtr dst_node_ptr = NodeAdapter::GNode2Node(dst_node); + if (dst_node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "AddDataEdge: dst gnode to node failed."); + return GRAPH_FAILED; + } + + graphStatus res = + GraphUtils::AddEdge(src_node_ptr->GetOutDataAnchor(src_port_index), dst_node_ptr->GetInDataAnchor(dst_port_index)); + if (res != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "AddDataEdge: Add data edge failed."); + return GRAPH_FAILED; + } + + return GRAPH_SUCCESS; +} + +graphStatus Graph::AddControlEdge(GNode &src_node, GNode &dst_node) { + if (impl_ == nullptr) { + GELOGE(GRAPH_FAILED, "AddControlEdge: graph can not be used, impl is nullptr."); + return GRAPH_FAILED; + } + + NodePtr src_node_ptr = NodeAdapter::GNode2Node(src_node); + if (src_node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "AddControlEdge: src gnode to node failed."); + return GRAPH_FAILED; + } + + NodePtr dst_node_ptr = NodeAdapter::GNode2Node(dst_node); + if (dst_node_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "AddControlEdge: dst gnode to node failed."); + return GRAPH_FAILED; + } + + graphStatus res = GraphUtils::AddEdge(src_node_ptr->GetOutControlAnchor(), dst_node_ptr->GetInControlAnchor()); + if (res != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "AddControlEdge: Add control edge failed."); + return GRAPH_FAILED; + } + + return SUCCESS; +} + +GraphPtr Graph::ConstructFromInputs(const std::vector &inputs, const ge::AscendString &name) { + const char *ascend_name = name.GetString(); + if (ascend_name == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "ConstructFromInputs: 
ascend string error."); + return nullptr; + } + + if (inputs.empty()) { + GELOGE(GRAPH_FAILED, "ConstructFromInputs: inputs size can not be 0."); + return nullptr; + } + + std::string graph_name = ascend_name; + ComputeGraphPtr compute_graph = GraphUtils::CreateGraphFromOperator(graph_name, inputs); + if (compute_graph == nullptr) { + GELOGE(GRAPH_FAILED, "ConstructFromInputs: create compute graph failed."); + return nullptr; + } + + compute_graph->SetInputSize(static_cast(inputs.size())); + Graph graph = GraphUtils::CreateGraphFromComputeGraph(compute_graph); + GraphPtr graph_ptr = std::make_shared(graph); + if (graph_ptr == nullptr) { + GELOGE(GRAPH_FAILED, "ConstructFromInputs: graph make shared failed."); + return nullptr; + } + + return graph_ptr; +} + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ComputeGraphPtr GraphUtils::GetComputeGraph(const ge::Graph &graph) { GE_CHK_BOOL_EXEC_NOLOG(graph.IsValid(), return nullptr); return graph.impl_->compute_graph_; diff --git a/src/common/graph/graph.mk b/src/common/graph/graph.mk index 4ea84919..562439d2 100644 --- a/src/common/graph/graph.mk +++ b/src/common/graph/graph.mk @@ -14,6 +14,8 @@ COMMON_LOCAL_SRC_FILES := \ ./attr_value.cc \ ./buffer.cc \ ./compute_graph.cc \ + ./ascend_string.cc \ + ./gnode.cc \ ./graph.cc \ ./inference_context.cc \ ./shape_refiner.cc \ @@ -98,11 +100,13 @@ LOCAL_CPPFLAGS += -fexceptions LOCAL_C_INCLUDES := $(COMMON_LOCAL_C_INCLUDES) LOCAL_SRC_FILES := \ + ../../out/graph/lib64/stub/attr_value.cc \ ../../out/graph/lib64/stub/graph.cc \ ../../out/graph/lib64/stub/operator.cc \ ../../out/graph/lib64/stub/tensor.cc \ ../../out/graph/lib64/stub/operator_factory.cc \ - + ../../out/graph/lib64/stub/ascend_string.cc \ + ../../out/graph/lib64/stub/gnode.cc \ LOCAL_SHARED_LIBRARIES := @@ -128,7 +132,8 @@ LOCAL_SRC_FILES := \ ../../out/graph/lib64/stub/operator_factory.cc \ ../../out/graph/lib64/stub/tensor.cc \ ../../out/graph/lib64/stub/inference_context.cc \ - + 
../../out/graph/lib64/stub/ascend_string.cc \ + ../../out/graph/lib64/stub/gnode.cc \ LOCAL_SHARED_LIBRARIES := @@ -173,11 +178,13 @@ LOCAL_CFLAGS += -O2 LOCAL_C_INCLUDES := $(COMMON_LOCAL_C_INCLUDES) LOCAL_SRC_FILES := \ + ../../out/graph/lib64/stub/attr_value.cc \ ../../out/graph/lib64/stub/graph.cc \ ../../out/graph/lib64/stub/operator.cc \ ../../out/graph/lib64/stub/tensor.cc \ ../../out/graph/lib64/stub/operator_factory.cc \ - + ../../out/graph/lib64/stub/ascend_string.cc \ + ../../out/graph/lib64/stub/gnode.cc \ LOCAL_SHARED_LIBRARIES := @@ -206,6 +213,8 @@ LOCAL_SRC_FILES := \ ../../out/graph/lib64/stub/operator_factory.cc \ ../../out/graph/lib64/stub/tensor.cc \ ../../out/graph/lib64/stub/inference_context.cc \ + ../../out/graph/lib64/stub/ascend_string.cc \ + ../../out/graph/lib64/stub/gnode.cc \ LOCAL_SHARED_LIBRARIES := diff --git a/src/common/graph/model.cc b/src/common/graph/model.cc index a3628204..b42d8ce3 100644 --- a/src/common/graph/model.cc +++ b/src/common/graph/model.cc @@ -47,6 +47,7 @@ const int ACCESS_PERMISSION_BITS = 0400; namespace ge { void Model::Init() { (void)AttrUtils::SetInt(this, ATTR_MODEL_MEMORY_SIZE, 0); + (void)AttrUtils::SetInt(this, ATTR_MODEL_P2P_MEMORY_SIZE, 0); (void)AttrUtils::SetInt(this, ATTR_MODEL_STREAM_NUM, 0); (void)AttrUtils::SetInt(this, ATTR_MODEL_EVENT_NUM, 0); (void)AttrUtils::SetInt(this, ATTR_MODEL_LABEL_NUM, 0); diff --git a/src/common/graph/model_serialize.cc b/src/common/graph/model_serialize.cc index 16855fc5..1ac01b1e 100644 --- a/src/common/graph/model_serialize.cc +++ b/src/common/graph/model_serialize.cc @@ -409,13 +409,13 @@ bool ModelSerializeImp::HandleNodeNameRef() { item.dst_node_name.c_str(), item.dst_in_index); return false; } - GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed."); // lint !e737 + GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed."); } else { // Control edge auto src_anchor = 
src_node_it->second->GetOutControlAnchor(); auto dst_anchor = item.dst_node->GetInControlAnchor(); if (src_anchor != nullptr && dst_anchor != nullptr) { - GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed."); // lint !e737 + GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed."); } } } diff --git a/src/common/graph/op_desc.cc b/src/common/graph/op_desc.cc index dee0aece..35cbf3f7 100644 --- a/src/common/graph/op_desc.cc +++ b/src/common/graph/op_desc.cc @@ -33,7 +33,6 @@ using std::shared_ptr; using std::string; using std::vector; -/*lint -save -e521 -e681 -e732 -e737*/ namespace ge { const std::string ATTR_NAME_ID = "id"; diff --git a/src/common/graph/operator.cc b/src/common/graph/operator.cc index 21554fa1..d196a3a7 100644 --- a/src/common/graph/operator.cc +++ b/src/common/graph/operator.cc @@ -56,9 +56,6 @@ using std::string; using std::to_string; using std::vector; -/*lint -save -e529 -e728*/ -/*lint -e446 -e732*/ -/*lint -e665*/ namespace ge { class OpIO { public: @@ -768,6 +765,8 @@ const std::map kAttrTypesMap = { {GeAttrValue::VT_BYTES, "VT_BYTES"}, {GeAttrValue::VT_GRAPH, "VT_GRAPH"}, {GeAttrValue::VT_NAMED_ATTRS, "VT_NAMED_ATTRS"}, + {GeAttrValue::VT_LIST_LIST_INT, "VT_LIST_LIST_INT"}, + {GeAttrValue::VT_DATA_TYPE, "VT_DATA_TYPE"}, {GeAttrValue::VT_LIST_BASE, "VT_LIST_BASE"}, {GeAttrValue::VT_LIST_STRING, "VT_LIST_STRING"}, {GeAttrValue::VT_LIST_FLOAT, "VT_LIST_FLOAT"}, @@ -778,6 +777,7 @@ const std::map kAttrTypesMap = { {GeAttrValue::VT_LIST_BYTES, "VT_LIST_BYTES"}, {GeAttrValue::VT_GRAPH, "VT_GRAPH"}, {GeAttrValue::VT_LIST_NAMED_ATTRS, "VT_LIST_NAMED_ATTRS"}, + {GeAttrValue::VT_LIST_DATA_TYPE, "VT_LIST_DATA_TYPE"}, }; } // namespace const std::map Operator::GetAllAttrNamesAndTypes() const { @@ -943,7 +943,7 @@ OperatorImplPtr Operator::GetOperatorImplPtr() const { return operator_impl_; } GELOGW("set attr name %s failed.", name.c_str()); \ } \ return *this; \ - } // lint !e665 
+ } #define OP_ATTR_GET_IMP(ArgType, AttrUtilsFun) \ graphStatus Operator::GetAttr(const string &name, ArgType attr_value) const { \ @@ -956,7 +956,7 @@ OperatorImplPtr Operator::GetOperatorImplPtr() const { return operator_impl_; } return GRAPH_FAILED; \ } \ return GRAPH_SUCCESS; \ - } // lint !e665 + } void Operator::BreakConnect() const { if (operator_impl_ == nullptr) { @@ -977,7 +977,7 @@ void Operator::BreakConnect() const { if (!AttrUtils::Set##AttrUtilsFun(operator_impl_->GetOpDescImpl(), name, attr_value)) { \ GELOGW("reg attr name %s failed.", name.c_str()); \ } \ - } // lint !e665 + } OP_ATTR_SET_IMP(int64_t, Int) OP_ATTR_SET_IMP(int32_t, Int) @@ -998,22 +998,22 @@ OP_ATTR_SET_IMP(const vector> &, ListListInt) OP_ATTR_SET_IMP(float, Float) OP_ATTR_GET_IMP(float &, Float) OP_ATTR_SET_IMP(const vector &, ListFloat) -OP_ATTR_GET_IMP(vector &, ListFloat) // lint !e665 +OP_ATTR_GET_IMP(vector &, ListFloat) OP_ATTR_SET_IMP(bool, Bool) OP_ATTR_GET_IMP(bool &, Bool) OP_ATTR_SET_IMP(const vector &, ListBool) -OP_ATTR_GET_IMP(vector &, ListBool) // lint !e665 +OP_ATTR_GET_IMP(vector &, ListBool) OP_ATTR_SET_IMP(const string &, Str) OP_ATTR_GET_IMP(string &, Str) OP_ATTR_SET_IMP(const vector &, ListStr) -OP_ATTR_GET_IMP(vector &, ListStr) // lint !e665 +OP_ATTR_GET_IMP(vector &, ListStr) OP_ATTR_SET_IMP(const GeAttrValue::NAMED_ATTRS &, NamedAttrs) OP_ATTR_GET_IMP(GeAttrValue::NAMED_ATTRS &, NamedAttrs) OP_ATTR_SET_IMP(const vector &, ListNamedAttrs) -OP_ATTR_GET_IMP(vector &, ListNamedAttrs) // lint !e665 +OP_ATTR_GET_IMP(vector &, ListNamedAttrs) OP_ATTR_REG_IMP(int64_t, Int) OP_ATTR_REG_IMP(const vector &, ListInt) @@ -1583,5 +1583,3 @@ void GraphUtils::BreakConnect(const std::map &all_node } } } // namespace ge -/*lint +e446 +e732*/ -/*lint +e665*/ diff --git a/src/common/graph/opsproto/opsproto_manager.cc b/src/common/graph/opsproto/opsproto_manager.cc index d482715b..397e02cd 100644 --- a/src/common/graph/opsproto/opsproto_manager.cc +++ 
b/src/common/graph/opsproto/opsproto_manager.cc @@ -38,9 +38,7 @@ bool OpsProtoManager::Initialize(const std::map &optio return true; } - /*lint -e1561*/ auto proto_iter = options.find("ge.opsProtoLibPath"); - /*lint +e1561*/ if (proto_iter == options.end()) { GELOGW("ge.opsProtoLibPath option not set, return."); return false; diff --git a/src/common/graph/option/ge_context.cc b/src/common/graph/option/ge_context.cc index 421e0aff..523841e2 100644 --- a/src/common/graph/option/ge_context.cc +++ b/src/common/graph/option/ge_context.cc @@ -31,6 +31,8 @@ GEContext &GetContext() { return ge_context; } +thread_local uint64_t GEContext::session_id_; + graphStatus GEContext::GetOption(const std::string &key, std::string &option) { return GetThreadLocalContext().GetOption(key, option); } diff --git a/src/common/graph/option/ge_local_context.cc b/src/common/graph/option/ge_local_context.cc index 82b1cb01..99c12c51 100644 --- a/src/common/graph/option/ge_local_context.cc +++ b/src/common/graph/option/ge_local_context.cc @@ -57,4 +57,18 @@ void GEThreadLocalContext::SetGraphOption(map options_map) graph_options_.clear(); graph_options_ = std::move(options_map); } + +map GEThreadLocalContext::GetAllGraphOptions() const { return graph_options_; } + +map GEThreadLocalContext::GetAllSessionOptions() const { return session_options_; } + +map GEThreadLocalContext::GetAllGlobalOptions() const { return global_options_; } + +map GEThreadLocalContext::GetAllOptions() const { + map options_all; + options_all.insert(graph_options_.begin(), graph_options_.end()); + options_all.insert(session_options_.begin(), session_options_.end()); + options_all.insert(global_options_.begin(), global_options_.end()); + return options_all; +} } // namespace ge diff --git a/src/common/graph/shape_refiner.cc b/src/common/graph/shape_refiner.cc index 17423da4..e3643e7b 100644 --- a/src/common/graph/shape_refiner.cc +++ b/src/common/graph/shape_refiner.cc @@ -365,6 +365,37 @@ string Serial(const vector 
&dims) { return serial_string; } +void SerialShapeRange(const GeTensorDescPtr &desc, std::string &desc_str) { + desc_str += "["; + std::vector> shape_range; + (void)desc->GetShapeRange(shape_range); + for (const auto &pair : shape_range) { + desc_str += "{"; + desc_str += std::to_string(pair.first) + "," + std::to_string(pair.second); + desc_str += "},"; + } + desc_str += "] "; +} + +void SerialShapeAndDtype(const GeTensorDescPtr &desc, bool is_origin_info, std::string &desc_str) { + desc_str += "["; + if (!is_origin_info) { + for (int64_t dim : desc->GetShape().GetDims()) { + desc_str += std::to_string(dim) + " "; + } + desc_str += "]"; + desc_str += ":" + TypeUtils::DataTypeToSerialString(desc->GetDataType()) + ":" + + TypeUtils::FormatToSerialString(desc->GetFormat()) + " "; + } else { + for (int64_t dim : desc->GetOriginShape().GetDims()) { + desc_str += std::to_string(dim) + " "; + } + desc_str += "]"; + desc_str += ":" + TypeUtils::DataTypeToSerialString(desc->GetOriginDataType()) + ":" + + TypeUtils::FormatToSerialString(desc->GetOriginFormat()) + " "; + } +} + graphStatus UpdateOpInputDesc(const ConstNodePtr &node_ptr) { GE_IF_BOOL_EXEC(node_ptr == nullptr, GELOGE(GRAPH_FAILED, "node is null."); return GRAPH_FAILED); GE_IF_BOOL_EXEC(node_ptr->GetOpDesc() == nullptr, GELOGE(GRAPH_FAILED, "op_desc is null."); return GRAPH_FAILED); @@ -386,9 +417,9 @@ graphStatus UpdateOpInputDesc(const ConstNodePtr &node_ptr) { if (in_desc == nullptr) { continue; } - auto in_shape = in_desc->GetShape().GetDims(); + auto in_shape = in_desc->MutableShape().GetDims(); auto in_dtype = in_desc->GetDataType(); - auto peer_out_shape = peer_out_desc->GetShape().GetDims(); + auto peer_out_shape = peer_out_desc->MutableShape().GetDims(); auto peer_out_dtype = peer_out_desc->GetDataType(); if (peer_out_dtype != in_dtype) { GELOGW( @@ -407,13 +438,15 @@ graphStatus UpdateOpInputDesc(const ConstNodePtr &node_ptr) { } // refresh current node input desc 
in_desc->SetOriginShape(peer_out_desc->GetOriginShape()); - in_desc->SetShape(peer_out_desc->GetShape()); + in_desc->SetShape(peer_out_desc->MutableShape()); in_desc->SetDataType(peer_out_desc->GetDataType()); in_desc->SetOriginDataType(peer_out_desc->GetOriginDataType()); - std::vector> shape_range; - (void)peer_out_desc->GetShapeRange(shape_range); - in_desc->SetShapeRange(shape_range); - ge::TensorUtils::SetRealDimCnt(*in_desc, static_cast(peer_out_desc->GetShape().GetDims().size())); + if (peer_out_desc->MutableShape().GetDims() != UNKNOWN_RANK) { + std::vector> shape_range; + (void)peer_out_desc->GetShapeRange(shape_range); + in_desc->SetShapeRange(shape_range); + } + ge::TensorUtils::SetRealDimCnt(*in_desc, static_cast(peer_out_desc->MutableShape().GetDims().size())); } return GRAPH_SUCCESS; } @@ -432,25 +465,19 @@ void ShapeRefiner::PrintInOutTensorShape(const ge::NodePtr &node, const std::str if (op_desc->GetInputsSize() != 0) { std::string input_desc_str = "input shape: "; for (const auto &input_desc : op_desc->GetAllInputsDescPtr()) { - input_desc_str += "["; - for (int64_t dim : input_desc->GetShape().GetDims()) { - input_desc_str += std::to_string(dim) + " "; - } - input_desc_str += "]"; - input_desc_str += ":" + TypeUtils::DataTypeToSerialString(input_desc->GetDataType()) + ":" + - TypeUtils::FormatToSerialString(input_desc->GetFormat()) + " "; + SerialShapeAndDtype(input_desc, false, input_desc_str); } str += input_desc_str; input_desc_str = "input origin shape: "; for (const auto &input_desc : op_desc->GetAllInputsDescPtr()) { - input_desc_str += "["; - for (int64_t dim : input_desc->GetOriginShape().GetDims()) { - input_desc_str += std::to_string(dim) + " "; - } - input_desc_str += "]"; - input_desc_str += ":" + TypeUtils::DataTypeToSerialString(input_desc->GetOriginDataType()) + ":" + - TypeUtils::FormatToSerialString(input_desc->GetOriginFormat()) + " "; + SerialShapeAndDtype(input_desc, true, input_desc_str); + } + str += input_desc_str; + + 
input_desc_str = "input shape range: "; + for (const auto &input_desc : op_desc->GetAllInputsDescPtr()) { + SerialShapeRange(input_desc, input_desc_str); } str += input_desc_str; } @@ -461,13 +488,7 @@ void ShapeRefiner::PrintInOutTensorShape(const ge::NodePtr &node, const std::str if (output_desc == nullptr) { continue; } - output_desc_str += "["; - for (int64_t dim : output_desc->GetShape().GetDims()) { - output_desc_str += std::to_string(dim) + " "; - } - output_desc_str += "]"; - output_desc_str += ":" + TypeUtils::DataTypeToSerialString(output_desc->GetDataType()) + ":" + - TypeUtils::FormatToSerialString(output_desc->GetFormat()) + " "; + SerialShapeAndDtype(output_desc, false, output_desc_str); } str += output_desc_str; @@ -476,13 +497,13 @@ void ShapeRefiner::PrintInOutTensorShape(const ge::NodePtr &node, const std::str if (output_desc == nullptr) { continue; } - output_desc_str += "["; - for (int64_t dim : output_desc->GetOriginShape().GetDims()) { - output_desc_str += std::to_string(dim) + " "; - } - output_desc_str += "]"; - output_desc_str += ":" + TypeUtils::DataTypeToSerialString(output_desc->GetOriginDataType()) + ":" + - TypeUtils::FormatToSerialString(output_desc->GetOriginFormat()) + " "; + SerialShapeAndDtype(output_desc, true, output_desc_str); + } + str += output_desc_str; + + output_desc_str = "output shape range: "; + for (const auto &output_desc : op_desc->GetAllOutputsDescPtr()) { + SerialShapeRange(output_desc, output_desc_str); } str += output_desc_str; } diff --git a/src/common/graph/stub/Makefile b/src/common/graph/stub/Makefile deleted file mode 100644 index f339fa33..00000000 --- a/src/common/graph/stub/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -inc_path := $(shell pwd)/metadef/inc/external/ -out_path := $(shell pwd)/out/graph/lib64/stub/ -stub_path := $(shell pwd)/metadef/graph/stub/ - -mkdir_stub := $(shell mkdir -p $(out_path)) -graph_local_stub := $(shell $(HI_PYTHON) $(stub_path)/gen_stubapi.py $(inc_path) $(out_path)) diff --git 
a/src/common/graph/stub/gen_stubapi.py b/src/common/graph/stub/gen_stubapi.py deleted file mode 100644 index 7263ff17..00000000 --- a/src/common/graph/stub/gen_stubapi.py +++ /dev/null @@ -1,578 +0,0 @@ -import os -import re -import sys -import logging - -logging.basicConfig(stream=sys.stdout, format='[%(asctime)s] [%(lineno)s] %(levelname)s: %(message)s', - level=logging.INFO) - -""" - this attr is used for symbol table visible -""" -GE_ATTR = 'GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY' - -""" - generate stub func body by return type -""" -RETURN_STATEMENTS = { - 'graphStatus': ' std::cout << "[ERROR]: stub library libgraph or libge_compiler cannot be used for execution, please check your "\n ' - ' << "environment variables and compilation options to make sure you use the correct library."\n' - ' << std::endl;\n' - ' return ACL_ERROR_COMPILING_STUB_MODE;', - 'Status': ' return SUCCESS;', - 'Graph': ' return Graph();', - 'Graph&': ' return *this;', - 'Format': ' return Format();', - 'Format&': ' return *this;', - 'Shape': ' return Shape();', - 'Shape&': ' return *this;', - 'TensorDesc': ' return TensorDesc();', - 'TensorDesc&': ' return *this;', - 'Tensor': ' return Tensor();', - 'Tensor&': ' return *this;', - 'Operator': ' return Operator();', - 'Operator&': ' return *this;', - 'Ptr': ' return nullptr;', - 'std::string': ' return "";', - 'std::string&': ' return "";', - 'string': ' return "";', - 'int': ' return 0;', - 'DataType': ' return DT_FLOAT;', - 'InferenceContextPtr': ' return nullptr;', - 'SubgraphBuilder': ' return nullptr;', - 'OperatorImplPtr': ' return nullptr;', - 'OutHandler': ' return nullptr;', - 'std::vector': ' return {};', - 'std::vector': ' return {};', - 'std::map': ' return {};', - 'uint32_t': ' return 0;', - 'int64_t': ' return 0;', - 'uint64_t': ' return 0;', - 'size_t': ' return 0;', - 'float': ' return 0.0f;', - 'bool': ' return false;', -} - -""" - max code len per line in hua_wei software programming specifications -""" 
-max_code_len_per_line = 100 - -""" - white_list_for_debug, include_dir_key_words is to - determines which header files to generate cc files from - when DEBUG on -""" -white_list_for_debug = ["attr_value.h", "operator.h", "tensor.h", "graph.h", "operator_factory.h", "inference_context.h", - "ge_ir_build.h", "ge_api.h", "ascend_string.h", "gnode.h"] -include_dir_key_words = ["ge", "graph"] -DEBUG = True - - -def need_generate_func(func_line): - """ - :param func_line: - :return: - """ - if func_line.strip().endswith("default") or func_line.strip().endswith("delete") \ - or func_line.strip().startswith("typedef") or func_line.strip().startswith("using"): - return False - return True - - -def file_endswith_white_list_suffix(file): - """ - :param file: - :return: - """ - if DEBUG: - for suffix in white_list_for_debug: - if file.endswith(suffix): - return True - return False - else: - return True - - -""" - belows are patterns used for analyse .h file -""" -# pattern function -pattern_func = re.compile(r"""(^[\s]*) #leading with space,we will find and delete after -([a-zA-Z~_] # void int likely -.* -[)] #we find ) -(?!.*{) # we do not want the case int abc() const -.*) -(;.*) #we want to find ; and after for we will replace these later -\n$ -""", re.VERBOSE | re.MULTILINE | re.DOTALL) - -# pattern comment -pattern_comment = re.compile(r'^\s*//') -pattern_comment_2_start = re.compile(r'^\s*/[*]') -pattern_comment_2_end = re.compile(r'[*]/\s*$') -# pattern define -pattern_define = re.compile(r'^\s*#define') -pattern_define_return = re.compile(r'\\\s*$') -# blank line -pattern_blank_line = re.compile(r'^\s*$') -# virtual,explicit,friend,static -pattern_keyword = re.compile(r'(virtual\s+|explicit\s+|friend\s+|static\s+)') -# lead space -pattern_leading_space = re.compile(r'(^[\s]*)[a-zA-Z~_]') -# functions will have patterns such as func ( or func( -# but operator is an exception; the class name is preceded by an operator, and the above mode does not exist -# format like 
:"operator = ()" -pattern_func_name = re.compile(r'([a-zA-Z0-9~_\-]+\s*|operator?.*)[(]') -# template -pattern_template = re.compile(r'^\s*template') -pattern_template_end = re.compile(r'>\s*$') -# namespace -pattern_namespace = re.compile(r'namespace.*{') -# class : which can handle classA a and {not on the same line, but if found ';' after class,then don't deal with -pattern_class = re.compile(r'^[\s]*(class|struct)\s+(%s\s+)?([a-zA-Z0-9_\-]+ 0 and not friend_match: - line, func_name = self.handle_class_member_func(line, template_string) - # Normal functions - else: - line, func_name = self.handle_normal_func(line, template_string) - - need_generate = need_generate_func(line) - # func body - line += self.implement_function(line) - # comment - line = self.gen_comment(start_i) + line - # write to out file - self.write_func_content(line, func_name, need_generate) - # next loop - self.line_index += 1 - - logging.info('Added %s functions', len(self.func_list_exist)) - logging.info('Successfully converted,please see ' + self.output_file) - - def handle_func1(self, line): - """ - :param line: - :return: - """ - find1 = re.search('[(]', line) - if not find1: - self.line_index += 1 - return "continue", line, None - find2 = re.search('[)]', line) - start_i = self.line_index - space_match = pattern_leading_space.search(line) - # deal with - # int abc(int a, - # int b) - if find1 and (not find2): - self.line_index += 1 - line2 = self.input_content[self.line_index] - if space_match: - line2 = re.sub('^' + space_match.group(1), '', line2) - line += line2 - while self.line_index < len(self.input_content) and (not re.search('[)]', line2)): - self.line_index += 1 - line2 = self.input_content[self.line_index] - line2 = re.sub('^' + space_match.group(1), '', line2) - line += line2 - - match_start = pattern_start.search(self.input_content[self.line_index]) - match_end = pattern_end.search(self.input_content[self.line_index]) - if match_start: # like ) { or ) {} int the last line - 
if not match_end: - self.stack.append('normal_now') - ii = start_i - while ii <= self.line_index: - ii += 1 - self.line_index += 1 - return "continue", line, start_i - logging.info("line[%s]", line) - # ' int abc();'->'int abc()' - (line, match) = pattern_func.subn(r'\2\n', line) - logging.info("line[%s]", line) - # deal with case: - # 'int \n abc(int a, int b)' - if re.search(r'^\s*(inline)?\s*[a-zA-Z0-9_]+\s*$', self.input_content[start_i - 1]): - line = self.input_content[start_i - 1] + line - line = line.lstrip() - if not match: - self.line_index += 1 - return "continue", line, start_i - return "pass", line, start_i - - def handle_stack(self, match_start): - """ - :param match_start: - :return: - """ - line = self.input_content[self.line_index] - match_end = pattern_end.search(line) - if match_start: - self.stack.append('normal_now') - if match_end: - top_status = self.stack.pop() - if top_status == 'namespace_now': - self.output_fd.write(line + '\n') - elif top_status == 'class_now': - self.stack_class.pop() - self.stack_template.pop() - if match_start or match_end: - self.line_index += 1 - return "continue" - - if len(self.stack) > 0 and self.stack[-1] == 'normal_now': - self.line_index += 1 - return "continue" - return "pass" - - def handle_class(self, template_string, line, match_start, match_class): - """ - :param template_string: - :param line: - :param match_start: - :param match_class: - :return: - """ - if match_class: # we face a class - self.stack_template.append(template_string) - self.stack.append('class_now') - class_name = match_class.group(3) - - # class template specializations: class A > - if '<' in class_name: - k = line.index('<') - fit = 1 - for ii in range(k + 1, len(line)): - if line[ii] == '<': - fit += 1 - if line[ii] == '>': - fit -= 1 - if fit == 0: - break - class_name += line[k + 1:ii + 1] - logging.info('class_name[%s]', class_name) - self.stack_class.append(class_name) - while not match_start: - self.line_index += 1 - line = 
self.input_content[self.line_index] - match_start = pattern_start.search(line) - self.line_index += 1 - return "continue" - return "pass" - - def handle_template(self): - line = self.input_content[self.line_index] - match_template = pattern_template.search(line) - template_string = '' - if match_template: - match_template_end = pattern_template_end.search(line) - template_string = line - while not match_template_end: - self.line_index += 1 - line = self.input_content[self.line_index] - template_string += line - match_template_end = pattern_template_end.search(line) - self.line_index += 1 - return template_string - - def handle_namespace(self): - line = self.input_content[self.line_index] - match_namespace = pattern_namespace.search(line) - if match_namespace: # we face namespace - self.output_fd.write(line + '\n') - self.stack.append('namespace_now') - self.line_index += 1 - - def handle_normal_func(self, line, template_string): - template_line = '' - self.stack_template.append(template_string) - if self.stack_template[-1] != '': - template_line = re.sub(r'\s*template', 'template', self.stack_template[-1]) - # change '< class T = a, class U = A(3)>' to '' - template_line = re.sub(r'\s*=.*>(\s*)$', r'>\1', template_line) - template_line = re.sub(r'\s*=.*,', ',', template_line) - template_line = re.sub(r'\s*=.*', '', template_line) - line = re.sub(r'\s*=.*,', ',', line) - line = re.sub(r'\s*=.*\)', ')', line) - line = template_line + line - self.stack_template.pop() - func_name = re.search(r'^.*\)', line, re.MULTILINE | re.DOTALL).group() - logging.info("line[%s]", line) - logging.info("func_name[%s]", func_name) - return line, func_name - - def handle_class_member_func(self, line, template_string): - template_line = '' - x = '' - if template_string != '': - template_string = re.sub(r'\s*template', 'template', template_string) - template_string = re.sub(r'\s*=.*>(\s*)$', r'>\1', template_string) - template_string = re.sub(r'\s*=.*,', ',', template_string) - 
template_string = re.sub(r'\s*=.*', '', template_string) - if self.stack_template[-1] != '': - if not (re.search(r'<\s*>', stack_template[-1])): - template_line = re.sub(r'^\s*template', 'template', stack_template[-1]) - if not (re.search(r'<.*>', self.stack_class[-1])): - # for x we get like template -> - x = re.sub(r'template\s*<', '<', template_line) # remove template -> - x = re.sub(r'\n', '', x) - x = re.sub(r'\s*=.*,', ',', x) - x = re.sub(r'\s*=.*\>', '>', x) - x = x.rstrip() # remove \n - x = re.sub(r'(class|typename)\s+|(|\s*class)', '', - x) # remove class,typename -> - x = re.sub(r'<\s+', '<', x) - x = re.sub(r'\s+>', '>', x) - x = re.sub(r'\s+,', ',', x) - x = re.sub(r',\s+', ', ', x) - line = re.sub(r'\s*=\s+0', '', line) - line = re.sub(r'\s*=\s+.*,', ',', line) - line = re.sub(r'\s*=\s+.*\)', ')', line) - logging.info("x[%s]\nline[%s]", x, line) - # if the function is long, void ABC::foo() - # breaks into two lines void ABC::\n foo() - temp_line = pattern_func_name.sub(self.stack_class[-1] + x + '::' + r'\1(', line, count=1) - if len(temp_line) > max_code_len_per_line: - line = pattern_func_name.sub(self.stack_class[-1] + x + '::\n' + r'\1(', line, count=1) - else: - line = temp_line - logging.info("line[%s]", line) - # add template as the above if there is one - template_line = re.sub(r'\s*=.*>(\s*)$', r'>\1', template_line) - template_line = re.sub(r'\s*=.*,', ',', template_line) - template_line = re.sub(r'\s*=.*', '', template_line) - line = template_line + template_string + line - func_name = re.search(r'^.*\)', line, re.MULTILINE | re.DOTALL).group() - logging.info("line[%s]", line) - logging.info("func_name[%s]", func_name) - return line, func_name - - def write_func_content(self, content, func_name, need_generate): - if not (func_name in self.func_list_exist) and need_generate: - self.output_fd.write(content) - self.func_list_exist.append(func_name) - logging.info('add func:[%s]', func_name) - - def gen_comment(self, start_i): - comment_line = 
'' - # Function comments are on top of function declarations, copy them over - k = start_i - 1 # one line before this func start - if pattern_template.search(self.input_content[k]): - k -= 1 - if pattern_comment_2_end.search(self.input_content[k]): - comment_line = self.input_content[k].lstrip() - while not pattern_comment_2_start.search(self.input_content[k]): - k -= 1 - comment_line = self.input_content[k].lstrip() + comment_line - else: - for j in range(k, 0, -1): - c_line = self.input_content[j] - if pattern_comment.search(c_line): - c_line = re.sub(r'\s*//', '//', c_line) - comment_line = c_line + comment_line - else: - break - return comment_line - - @staticmethod - def implement_function(func): - function_def = '' - function_def += '{\n' - - all_items = func.split() - start = 0 - return_type = all_items[start] - if return_type == "const": - start += 1 - return_type = all_items[start] - if return_type.startswith(('std::map', 'std::set', 'std::vector')): - return_type = "std::map" - if return_type.endswith('*') or (len(all_items) > start + 1 and all_items[start + 1].startswith('*')): - return_type = "Ptr" - if len(all_items) > start + 1 and all_items[start + 1].startswith('&'): - return_type += "&" - if RETURN_STATEMENTS.__contains__(return_type): - function_def += RETURN_STATEMENTS[return_type] - else: - logging.warning("Unhandled return type[%s]", return_type) - - function_def += '\n' - function_def += '}\n' - function_def += '\n' - return function_def - - -def collect_header_files(path): - """ - :param path: - :return: - """ - header_files = [] - shared_includes_content = [] - for root, dirs, files in os.walk(path): - files.sort() - for file in files: - if file.find("git") >= 0: - continue - if not file.endswith('.h'): - continue - file_path = os.path.join(root, file) - file_path = file_path.replace('\\', '/') - header_files.append(file_path) - include_str = '#include "{}"\n'.format(file_path[path.rindex('/') + 1:]) - 
shared_includes_content.append(include_str) - # for acl error code - shared_includes_content.append('#include \n') - shared_includes_content.append('const int ACL_ERROR_COMPILING_STUB_MODE = 100039;\n') - return header_files, shared_includes_content - - -def generate_stub_file(inc_dir, out_cc_dir): - """ - :param inc_dir: - :param out_cc_dir: - :return: - """ - target_header_files, shared_includes_content = collect_header_files(inc_dir) - for header_file in target_header_files: - if not file_endswith_white_list_suffix(header_file): - continue - cc_file = re.sub('.h*$', '.cc', header_file) - h_2_cc = H2CC(header_file, out_cc_dir + cc_file[cc_file.rindex('/') + 1:], shared_includes_content) - h_2_cc.h2cc() - - -def gen_code(inc_dir, out_cc_dir): - """ - :param inc_dir: - :param out_cc_dir: - :return: - """ - if not inc_dir.endswith('/'): - inc_dir += '/' - if not out_cc_dir.endswith('/'): - out_cc_dir += '/' - for include_dir_key_word in include_dir_key_words: - generate_stub_file(inc_dir + include_dir_key_word, out_cc_dir) - - -if __name__ == '__main__': - inc_dir = sys.argv[1] - out_cc_dir = sys.argv[2] - gen_code(inc_dir, out_cc_dir) diff --git a/src/common/graph/tensor.cc b/src/common/graph/tensor.cc index 1f30c876..0d511645 100644 --- a/src/common/graph/tensor.cc +++ b/src/common/graph/tensor.cc @@ -178,18 +178,16 @@ int64_t Shape::GetShapeSize() const { return 0; } -TensorDesc::TensorDesc() { - impl = ComGraphMakeShared(); // lint !e665 -} +TensorDesc::TensorDesc() { impl = ComGraphMakeShared(); } TensorDesc::TensorDesc(Shape shape, Format format, DataType dt) { - impl = ComGraphMakeShared(shape, format, dt); // lint !e665 + impl = ComGraphMakeShared(shape, format, dt); SetRealDimCnt(shape.GetDimNum()); } TensorDesc::TensorDesc(const TensorDesc &desc) { // Copy - impl = ComGraphMakeShared(); // lint !e665 + impl = ComGraphMakeShared(); if (desc.impl != nullptr && impl != nullptr) { *impl = *desc.impl; } @@ -360,9 +358,7 @@ void TensorDesc::SetName(const 
std::string &name) { Tensor::Tensor() { impl = ComGraphMakeShared(); } -Tensor::Tensor(const TensorDesc &tensor_desc) { - impl = ComGraphMakeShared(tensor_desc); // lint !e665 -} +Tensor::Tensor(const TensorDesc &tensor_desc) { impl = ComGraphMakeShared(tensor_desc); } Tensor::Tensor(const TensorDesc &tensor_desc, const std::vector &data) { uint64_t shape_size = tensor_desc.GetShape().GetShapeSize(); @@ -384,7 +380,7 @@ Tensor::Tensor(const TensorDesc &tensor_desc, const std::vector &data) } } } - impl = ComGraphMakeShared(tensor_desc, data); // lint !e665 + impl = ComGraphMakeShared(tensor_desc, data); } Tensor::Tensor(const TensorDesc &tensor_desc, const uint8_t *data, size_t size) { @@ -406,7 +402,7 @@ Tensor::Tensor(const TensorDesc &tensor_desc, const uint8_t *data, size_t size) } } - impl = ComGraphMakeShared(tensor_desc, data, size); // lint !e665 + impl = ComGraphMakeShared(tensor_desc, data, size); } Tensor::Tensor(TensorDesc &&tensor_desc, std::vector &&data) { @@ -429,7 +425,7 @@ Tensor::Tensor(TensorDesc &&tensor_desc, std::vector &&data) { } } } - impl = ComGraphMakeShared(std::move(tensor_desc), std::move(data)); // lint !e665 + impl = ComGraphMakeShared(std::move(tensor_desc), std::move(data)); } TensorDesc Tensor::GetTensorDesc() const { @@ -643,7 +639,7 @@ TensorDesc TensorAdapter::GeTensorDesc2TensorDesc(const GeTensorDesc &ge_tensor_ GeTensorPtr TensorAdapter::Tensor2GeTensor(const Tensor &tensor) { GeTensorPtr ge_tensor; if (tensor.impl != nullptr) { - ge_tensor = ComGraphMakeShared(tensor.impl->ge_tensor.Clone()); // lint !e665 + ge_tensor = ComGraphMakeShared(tensor.impl->ge_tensor.Clone()); } return ge_tensor; } @@ -659,7 +655,7 @@ Tensor TensorAdapter::GeTensor2Tensor(const ConstGeTensorPtr &ge_tensor) { ConstGeTensorPtr TensorAdapter::AsGeTensorPtr(const Tensor &tensor) { GeTensorPtr ge_tensor; if (tensor.impl != nullptr) { - ge_tensor = ComGraphMakeShared(tensor.impl->ge_tensor); // lint !e665 + ge_tensor = 
ComGraphMakeShared(tensor.impl->ge_tensor); } return ge_tensor; } @@ -667,7 +663,7 @@ ConstGeTensorPtr TensorAdapter::AsGeTensorPtr(const Tensor &tensor) { GeTensorPtr TensorAdapter::AsGeTensorPtr(Tensor &tensor) { GeTensorPtr ge_tensor; if (tensor.impl != nullptr) { - ge_tensor = ComGraphMakeShared(tensor.impl->ge_tensor); // lint !e665 + ge_tensor = ComGraphMakeShared(tensor.impl->ge_tensor); } return ge_tensor; } diff --git a/src/common/graph/utils/graph_utils.cc b/src/common/graph/utils/graph_utils.cc index c741a316..86c71f23 100644 --- a/src/common/graph/utils/graph_utils.cc +++ b/src/common/graph/utils/graph_utils.cc @@ -58,8 +58,10 @@ namespace { const int32_t kBaseOfIntegerValue = 10; #ifdef FMK_SUPPORT_DUMP const char *const kDumpGeGraph = "DUMP_GE_GRAPH"; -const int kDumpGraphIndexWidth = 5; +const int kDumpGraphIndexWidth = 8; #endif + +const char *const kDumpGraphPath = "DUMP_GRAPH_PATH"; const char *const kDumpGraphLevel = "DUMP_GRAPH_LEVEL"; const char *const kDumpStrBuild = "Build"; const char *const kDumpStrPartition = "partition"; @@ -588,6 +590,11 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraph(cons } std::stringstream stream_file_name; + char *dump_graph_path = std::getenv(kDumpGraphPath); + if (dump_graph_path != nullptr) { + std::string dump_graph_path_str(dump_graph_path); + stream_file_name << (dump_graph_path_str.empty() ? "" : dump_graph_path_str + "/"); + } stream_file_name << "ge_proto_" << std::setw(kDumpGraphIndexWidth) << std::setfill('0') << file_index; stream_file_name << "_" << suffix << ".txt"; std::string proto_file = user_graph_name.empty() ? stream_file_name.str() : user_graph_name; @@ -598,7 +605,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraph(cons Buffer buffer; const int64_t kDumpLevel = (dump_ge_graph != nullptr) ? 
std::strtol(dump_ge_graph, nullptr, kBaseOfIntegerValue) : ge::OnnxUtils::NO_DUMP; - model.Save(buffer, kDumpLevel != ge::OnnxUtils::DUMP_ALL); + model.Save(buffer, kDumpLevel != ge::OnnxUtils::DUMP_ALL && !is_always_dump); // Write file ge::proto::ModelDef ge_proto; @@ -620,6 +627,54 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraph(cons #endif } +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGrph(const ge::ComputeGraphPtr &graph, + const std::string &path, + const std::string &suffix) { + // file name + static std::atomic_long atomic_file_index(0); + auto file_index = atomic_file_index.fetch_add(1); + GELOGD("Start to dump om txt: %ld", file_index); + + thread_local long max_dump_file_num = 0; + if (max_dump_file_num == 0) { + string opt = "0"; + (void)GetContext().GetOption(OPTION_GE_MAX_DUMP_FILE_NUM, opt); + max_dump_file_num = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue); + } + if (max_dump_file_num != 0 && file_index > max_dump_file_num) { + GELOGW("Dump graph file cnt > maxDumpFileNum, maxDumpFileCnt=%ld.", max_dump_file_num); + return; + } + + std::stringstream stream_file_name; + stream_file_name << path.c_str() << "/ge_proto_" << std::setw(5) << std::setfill('0') << file_index; + stream_file_name << "_" << suffix << ".txt"; + std::string proto_file = stream_file_name.str(); + + // Create buffer + ge::Model model("", ""); + model.SetGraph(GraphUtils::CreateGraphFromComputeGraph(std::const_pointer_cast(graph))); + Buffer buffer; + const int64_t kDumpLevel = ge::OnnxUtils::NO_DUMP; + model.Save(buffer, kDumpLevel != ge::OnnxUtils::DUMP_ALL); + + // Write file + ge::proto::ModelDef ge_proto; + if (buffer.GetData() != nullptr) { + std::string str(reinterpret_cast(buffer.GetData()), buffer.GetSize()); + if (!ge_proto.ParseFromString(str)) { + GELOGE(GRAPH_FAILED, "parse from string failed."); + return; + } + char real_path[PATH_MAX] = {0x00}; + 
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(strlen(proto_file.c_str()) >= PATH_MAX, return, "file path is too longer!"); + GE_IF_BOOL_EXEC(realpath(proto_file.c_str(), real_path) == nullptr, + GELOGI("file %s does not exist, it will be created.", proto_file.c_str())); + + GraphUtils::WriteProtoToTextFile(ge_proto, real_path); + } +} + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::LoadGEGraph(const char *file, ge::ComputeGraph &compute_graph) { ge::proto::ModelDef model_def; @@ -722,7 +777,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::WriteProtoToText } GE_CHK_BOOL_EXEC(fclose(file) == 0, return, "Fclose fileoutputstream failed"); #else - GELOGW("need to define FMK_SUPPORT_DUMP for dump graph."); + GELOGW("Need to define FMK_SUPPORT_DUMP for dump graph."); #endif } @@ -789,6 +844,11 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraphToOnn } std::stringstream stream_file_name; + char *dump_graph_path = std::getenv(kDumpGraphPath); + if (dump_graph_path != nullptr) { + std::string dump_graph_path_str(dump_graph_path); + stream_file_name << (dump_graph_path_str.empty() ? 
"" : dump_graph_path_str + "/"); + } stream_file_name << "ge_onnx_" << std::setw(kDumpGraphIndexWidth) << std::setfill('0') << file_index; stream_file_name << "_graph_" << compute_graph.GetGraphID(); stream_file_name << "_" << suffix << ".pbtxt"; @@ -822,6 +882,66 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraphToOnn #endif } +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGrphToOnnx(const ge::ComputeGraph &compute_graph, + const std::string &path, + const std::string &suffix) { + // 1.Get ge::onnx::ModelProto from ge::Model + ge::Model model("GE", ""); + std::shared_ptr compute_graph_ptr = ComGraphMakeShared(compute_graph); + model.SetGraph(GraphUtils::CreateGraphFromComputeGraph(std::const_pointer_cast(compute_graph_ptr))); + onnx::ModelProto model_proto; + if (!OnnxUtils::ConvertGeModelToModelProto(model, model_proto)) { + GELOGE(GRAPH_FAILED, "DumpGEGraphToOnnx failed."); + return; + } + + // 2.Set file name + static std::atomic_long atomic_file_index(0); + auto file_index = atomic_file_index.fetch_add(1); + GELOGD("Start to dump ge onnx file: %ld", file_index); + + thread_local long max_dump_file_num = 0; + if (max_dump_file_num == 0) { + string opt = "0"; + (void)GetContext().GetOption(OPTION_GE_MAX_DUMP_FILE_NUM, opt); + max_dump_file_num = std::strtol(opt.c_str(), nullptr, kBaseOfIntegerValue); + } + if (max_dump_file_num != 0 && file_index > max_dump_file_num) { + GELOGW("Dump graph file cnt > maxDumpFileNum, maxDumpFileNum=%ld.", max_dump_file_num); + return; + } + + std::stringstream stream_file_name; + stream_file_name << path.c_str() << "/ge_onnx_" << std::setw(5) << std::setfill('0') << file_index; + stream_file_name << "_graph_" << compute_graph.GetGraphID(); + stream_file_name << "_" << suffix << ".pbtxt"; + std::string proto_file = stream_file_name.str(); + if ((proto_file.length()) >= NAME_MAX) { + GELOGE(GRAPH_FAILED, "File name is too longer!"); + return; + } + std::unique_ptr real_path(new 
(std::nothrow) char[PATH_MAX]{0}); + if (real_path == nullptr) { + GELOGE(GRAPH_FAILED, "New real_path failed."); + return; + } + /// Returning nullptr means 3 case as follows: + /// a.path is PATH_MAX chars or more + /// b.the file does not exist + /// c.the path has no permissions + /// Distinguish between last the two cases in the function WriteProtoToTextFile call open() + if (realpath(proto_file.c_str(), real_path.get()) == nullptr) { + // For case a + if (errno == ENAMETOOLONG) { + GELOGE(GRAPH_FAILED, "Call realpath failed: path is PATH_MAX chars or more."); + return; + } + } + + // 3. Serialize to file in current path + GraphUtils::WriteProtoToTextFile(model_proto, real_path.get()); +} + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::LoadGEGraphFromOnnx(const char *file, ge::ComputeGraph &compute_graph) { if (file == nullptr) { @@ -1419,7 +1539,7 @@ GraphUtils::CloneGraph(const ComputeGraphPtr &graph, const std::string &prefix, return nullptr; } - op_desc->SetName(prefix + n->GetName()); + op_desc->SetName(n->GetName() + prefix); NodePtr node = new_graph->AddNode(op_desc); GE_CHK_BOOL_EXEC(node != nullptr, return nullptr, "Add node[%s] to graph failed", op_desc->GetName().c_str()); all_new_nodes[node->GetName()] = node; @@ -1445,6 +1565,17 @@ GraphUtils::CloneGraph(const ComputeGraphPtr &graph, const std::string &prefix, return nullptr; } } + + // copy info of output nodes from old graph to new graph. 
+ std::vector> out_nodes_info = graph->GetGraphOutNodesInfo(); + std::vector> new_out_nodes_info; + for (const auto &info : out_nodes_info) { + auto it = all_new_nodes.find(info.first->GetName()); + if (it != all_new_nodes.end()) { + new_out_nodes_info.emplace_back(it->second, info.second); + } + } + new_graph->SetGraphOutNodesInfo(new_out_nodes_info); return new_graph; } @@ -1501,7 +1632,7 @@ graphStatus GraphUtils::RelinkGraphEdges(const NodePtr &node, const string &pref return GRAPH_FAILED; } - auto it = all_nodes.find(prefix + node->GetName()); + auto it = all_nodes.find(node->GetName() + prefix); if (it == all_nodes.end()) { GELOGE(GRAPH_FAILED, "node[%s] not found", node->GetName().c_str()); return GRAPH_FAILED; @@ -1517,7 +1648,7 @@ graphStatus GraphUtils::RelinkGraphEdges(const NodePtr &node, const string &pref } GE_CHK_BOOL_EXEC(out_anchor->GetOwnerNode() != nullptr, return GRAPH_FAILED, "Peer out node is null"); - it = all_nodes.find(prefix + out_anchor->GetOwnerNode()->GetName()); + it = all_nodes.find(out_anchor->GetOwnerNode()->GetName() + prefix); if (it == all_nodes.end()) { GELOGE(GRAPH_FAILED, "node[%s] not found", out_anchor->GetOwnerNode()->GetName().c_str()); return GRAPH_FAILED; @@ -1535,7 +1666,7 @@ graphStatus GraphUtils::RelinkGraphEdges(const NodePtr &node, const string &pref GE_CHK_BOOL_EXEC(out_anchor != nullptr, continue, "Peer out anchor is null: %s", node->GetName().c_str()); GE_CHK_BOOL_EXEC(out_anchor->GetOwnerNode() != nullptr, return GRAPH_FAILED, "Peer out node is null"); - it = all_nodes.find(prefix + out_anchor->GetOwnerNode()->GetName()); + it = all_nodes.find(out_anchor->GetOwnerNode()->GetName() + prefix); if (it == all_nodes.end()) { GELOGE(GRAPH_FAILED, "node[%s] not found", out_anchor->GetOwnerNode()->GetName().c_str()); return GRAPH_FAILED; @@ -1736,7 +1867,7 @@ graphStatus GraphUtils::HandleMergeInput(const NodePtr &node, if (AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_NEXT_ITERATION, next_name) && !next_name.empty()) 
{ ComputeGraphPtr graph = node->GetOwnerComputeGraph(); GE_CHECK_NOTNULL(graph); - ge::NodePtr next_node = graph->FindNode(next_name); + ge::NodePtr next_node = FindNodeFromAllNodes(graph, next_name); GE_CHECK_NOTNULL(next_node); // NextIteration has and only has one output peer_out_anchor = next_node->GetOutDataAnchor(0); @@ -2332,15 +2463,12 @@ CompleteGraphBuilder &CompleteGraphBuilder::SetOutputMapping(const std::map(new (std::nothrow) ComputeGraph(name_)); - if ((owner_graph_ == nullptr) || (parent_node_ == nullptr)) { + if (owner_graph_ == nullptr) { error_code = GRAPH_FAILED; - error_msg = "graph / parent_node is NULL."; + error_msg = "graph is NULL."; return nullptr; } - owner_graph_->SetParentNode(parent_node_); - owner_graph_->SetParentGraph(parent_node_->GetOwnerComputeGraph()); - BuildNodes(error_code, error_msg); if (error_code != GRAPH_SUCCESS) { return nullptr; @@ -2361,37 +2489,27 @@ ComputeGraphPtr CompleteGraphBuilder::Build(graphStatus &error_code, std::string return nullptr; } - AddRetValNodes(error_code, error_msg); - if (error_code != GRAPH_SUCCESS) { - return nullptr; + if (retval_flag_) { + AddRetValNodes(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } + BuildGraphTargets(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } + } else { + AddNetOutputNode(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } } - BuildGraphTargets(error_code, error_msg); + PostProcess(error_code, error_msg); if (error_code != GRAPH_SUCCESS) { return nullptr; } - // ATTR_NAME_SESSION_GRAPH_ID - std::string graph_id; - if (!AttrUtils::GetStr(parent_node_->GetOwnerComputeGraph(), ATTR_NAME_SESSION_GRAPH_ID, graph_id)) { - error_code = GRAPH_FAILED; - error_msg = "Get attr session_graph_id failed."; - return nullptr; - } - if (!AttrUtils::SetStr(owner_graph_, ATTR_NAME_SESSION_GRAPH_ID, graph_id)) { - error_code = GRAPH_FAILED; - error_msg = "Set attr session_graph_id 
failed."; - return nullptr; - } - - // refresh node name - for (const NodePtr &node : owner_graph_->GetDirectNode()) { - if ((node->GetOpDesc() == nullptr) || (node->GetType() == VARIABLE) || (node->GetType() == VARIABLEV2)) { - continue; - } - node->GetOpDesc()->SetName(owner_graph_->GetName() + "/" + node->GetName()); - } - return owner_graph_; } @@ -2586,7 +2704,144 @@ void CompleteGraphBuilder::BuildGraphTargets(graphStatus &error_code, std::strin target_nodes.emplace_back(target_iter->second); } owner_graph_->SetGraphTargetNodesInfo(target_nodes); - return; +} + +/// +/// @brief Add NetOutput node +/// @param [out] error_code +/// @param [out] error_msg +/// @return void +/// +void CompleteGraphBuilder::AddNetOutputNode(graphStatus &error_code, std::string &error_msg) { + std::string log_msg = "AddNetOutputNode name:" + std::string(NODE_NAME_NET_OUTPUT) + ", type:" + NETOUTPUT; + OpDescPtr net_output_desc = shared_ptr(new (std::nothrow) OpDesc(NODE_NAME_NET_OUTPUT, NETOUTPUT)); + if (net_output_desc == nullptr) { + error_code = GRAPH_FAILED; + error_msg = log_msg + " failed: op_desc is NULL."; + return; + } + + size_t output_num = graph_outputs_.size(); + std::vector peer_out_anchors(output_num); + for (size_t i = 0; i < output_num; i++) { + int32_t index = graph_outputs_[i].second; + auto out_iter = node_names_.find(graph_outputs_[i].first); + if (out_iter == node_names_.end()) { + error_code = GRAPH_FAILED; + error_msg = "AddNetOutputNode failed: node " + graph_outputs_[i].first + " not exist in graph."; + return; + } + NodePtr node = out_iter->second; + if ((node == nullptr) || (node->GetOpDesc() == nullptr)) { + error_code = GRAPH_FAILED; + error_msg = "AddNetOutputNode failed: node is NULL."; + return; + } + + ge::GeTensorDesc tensor = node->GetOpDesc()->GetOutputDesc(index); + uint32_t update_index = i; + auto iter = output_mapping_.find(i); + if (iter != output_mapping_.end()) { + update_index = iter->second; + } + if (!ge::AttrUtils::SetInt(tensor, 
ATTR_NAME_PARENT_NODE_INDEX, update_index)) { + error_code = GRAPH_FAILED; + error_msg = "AddNetOutputNode failed: set attr PARENT_NODE_INDEX failed."; + return; + } + if (net_output_desc->AddInputDesc(tensor) != GRAPH_SUCCESS) { + error_code = GRAPH_FAILED; + error_msg = "AddNetOutputNode failed: add input_desc ailed."; + return; + } + peer_out_anchors[i] = node->GetOutDataAnchor(index); + } + + BuildNetOutputNodeWithLink(net_output_desc, peer_out_anchors, error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return; + } + + GELOGD("%s succ.", log_msg.c_str()); +} + +/// +/// @brief Build NetOutput nodes with data & ctrl edges +/// @param [in] net_output_desc +/// @param [in] peer_out_anchors +/// @param [out] error_code +/// @param [out] error_msg +/// @return void +/// +void CompleteGraphBuilder::BuildNetOutputNodeWithLink(const OpDescPtr &net_output_desc, + const std::vector &peer_out_anchors, + graphStatus &error_code, std::string &error_msg) { + std::string log_msg = "AddNetOutputNode name:" + std::string(NODE_NAME_NET_OUTPUT) + ", type:" + NETOUTPUT; + NodePtr net_output = owner_graph_->AddNode(net_output_desc); + if (net_output == nullptr) { + error_code = GRAPH_FAILED; + error_msg = log_msg + " failed: add NetOutput node failed."; + return; + } + + size_t output_num = graph_outputs_.size(); + for (size_t i = 0; i < output_num; i++) { + if (GraphUtils::AddEdge(peer_out_anchors[i], net_output->GetInDataAnchor(i)) != GRAPH_SUCCESS) { + error_code = GRAPH_FAILED; + error_msg = "AddNetOutputNode failed: add data-edge " + peer_out_anchors[i]->GetOwnerNode()->GetName() + ":" + + std::to_string(peer_out_anchors[i]->GetIdx()) + "->" + NODE_NAME_NET_OUTPUT + ":" + + std::to_string(i) + " failed."; + return; + } + } + for (const std::string &target_name : graph_targets_) { + auto target_iter = node_names_.find(target_name); + if ((target_iter == node_names_.end()) || (target_iter->second == nullptr)) { + error_code = GRAPH_FAILED; + error_msg = 
"BuildGraphTargets failed: target_node " + target_name + " not exist in graph."; + return; + } + const auto &target_node = target_iter->second; + if (GraphUtils::AddEdge(target_node->GetOutControlAnchor(), net_output->GetInControlAnchor()) != GRAPH_SUCCESS) { + error_code = GRAPH_FAILED; + error_msg = + "AddNetOutputNode failed: add ctrl-edge " + target_node->GetName() + "->" + NODE_NAME_NET_OUTPUT + " failed."; + return; + } + } +} + +/// +/// @brief process after build +/// @param [out] error_code +/// @param [out] error_msg +/// @return void +/// +void CompleteGraphBuilder::PostProcess(graphStatus &error_code, std::string &error_msg) { + if (parent_node_ != nullptr) { + owner_graph_->SetParentNode(parent_node_); + owner_graph_->SetParentGraph(parent_node_->GetOwnerComputeGraph()); + // ATTR_NAME_SESSION_GRAPH_ID + std::string graph_id; + if (!AttrUtils::GetStr(parent_node_->GetOwnerComputeGraph(), ATTR_NAME_SESSION_GRAPH_ID, graph_id)) { + error_code = GRAPH_FAILED; + error_msg = "Get attr session_graph_id failed."; + return; + } + if (!AttrUtils::SetStr(owner_graph_, ATTR_NAME_SESSION_GRAPH_ID, graph_id)) { + error_code = GRAPH_FAILED; + error_msg = "Set attr session_graph_id failed."; + return; + } + } + + // refresh node name + for (const NodePtr &node : owner_graph_->GetDirectNode()) { + if ((node->GetOpDesc() == nullptr) || (node->GetType() == VARIABLE) || (node->GetType() == VARIABLEV2)) { + continue; + } + node->GetOpDesc()->SetName(owner_graph_->GetName() + "/" + node->GetName()); + } } /// diff --git a/src/common/graph/utils/node_utils.cc b/src/common/graph/utils/node_utils.cc index 684e37ac..65ff6cc4 100644 --- a/src/common/graph/utils/node_utils.cc +++ b/src/common/graph/utils/node_utils.cc @@ -391,7 +391,9 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus NodeUtils::AppendInpu GELOGE(GRAPH_FAILED, "Add input desc failed"); return GRAPH_FAILED; } + } + for (size_t i = node->in_data_anchors_.size(); i < num; ++i) { auto anchor = 
ComGraphMakeShared(node, i); if (anchor == nullptr) { GELOGE(OUT_OF_MEMORY, "Current in data anchor is null, make shared_ptr failed."); @@ -444,7 +446,9 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus NodeUtils::AppendOutp GELOGE(GRAPH_FAILED, "Add output desc failed"); return GRAPH_FAILED; } + } + for (size_t i = node->out_data_anchors_.size(); i < num; ++i) { auto anchor = ComGraphMakeShared(node, i); if (anchor == nullptr) { GELOGE(OUT_OF_MEMORY, "Current out data anchor is null, make shared_ptr failed."); @@ -644,6 +648,20 @@ std::string NodeUtils::GetNodeType(const Node &node) { std::string NodeUtils::GetNodeType(const NodePtr &node) { return node == nullptr ? "" : GetNodeType(*node); } +std::vector NodeUtils::GetAllSubgraphs(const Node &node) { + auto op_desc = node.GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "Failed to get op desc from node %s ", node.GetName().c_str()); + return {}; + } + auto root_graph = GraphUtils::FindRootGraph(node.GetOwnerComputeGraph()); + if (root_graph == nullptr) { + GELOGE(GRAPH_FAILED, "Failed to find root graph from node %s ", node.GetName().c_str()); + return {}; + } + return root_graph->GetAllSubgraphs(); +} + ComputeGraphPtr NodeUtils::GetSubgraph(const Node &node, uint32_t index) { auto op_desc = node.GetOpDesc(); if (op_desc == nullptr) { @@ -1002,4 +1020,23 @@ vector> NodeUtils::GetOutDataNodesWithAnchorByInd } ConstNodePtr NodeUtils::GetNodeFromOperator(const Operator &oprt) { return oprt.GetNode(); } + +std::string NodeUtils::GetInConstNodeTypeCrossSubgraph(const NodePtr &node) { + NodePtr input_node = node; + while (input_node != nullptr) { + if (input_node->GetType() != DATA) { + return input_node->GetType(); + } + + auto owner_graph = input_node->GetOwnerComputeGraph(); + auto parent_node = owner_graph->GetParentNode(); + if ((parent_node == nullptr) || (kWhileOpTypes.count(parent_node->GetType()) > 0)) { + return node->GetType(); // not in subgraph or while subgraph. 
+ } + + input_node = GetParentInput(input_node); + } + + return ""; +} } // namespace ge diff --git a/src/common/graph/utils/op_desc_utils.cc b/src/common/graph/utils/op_desc_utils.cc index 17c80b2c..f9cfb901 100644 --- a/src/common/graph/utils/op_desc_utils.cc +++ b/src/common/graph/utils/op_desc_utils.cc @@ -28,7 +28,6 @@ using std::vector; -/*lint -e512 -e737 -e752*/ namespace ge { const char OP_DESC_QUANT_PARAMS[] = "quantize_factor"; static const int CONST_OP_NORMAL_WEIGHT_SIZE = 1; @@ -133,11 +132,11 @@ graphStatus OpDescUtils::GetQuantizeFactorParams(const OpDesc &op_desc, Quantize GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus OpDescUtils::SetQuantizeFactorParams(const OpDescPtr &op_desc, const QuantizeFactorParams &quant) { GE_CHK_BOOL_EXEC_INFO(op_desc != nullptr, return GRAPH_FAILED, "op_desc is nullptr"); - return op_desc->SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom(quant)); // lint !e732 + return op_desc->SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom(quant)); } graphStatus OpDescUtils::SetQuantizeFactorParams(OpDesc &op_desc, const QuantizeFactorParams &quant) { - return op_desc.SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom(quant)); // lint !e732 + return op_desc.SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom(quant)); } GeTensorPtr OpDescUtils::MutableWeights(OpDesc &op_desc) { @@ -255,7 +254,7 @@ size_t OpDescUtils::GetNonConstInputsSize(const ge::Node &node) { continue; } } - return input_num; // lint !e712 + return input_num; } else { GE_IF_BOOL_EXEC( node.GetInDataNodes().size() < GetConstInputs(node).size(), @@ -360,7 +359,7 @@ bool OpDescUtils::IsNonConstInput(const ge::Node &node, const size_t index) { bool ret = false; if (index < node.GetAllInDataAnchors().size()) { if (NodeUtils::IsAnchorStatusSet(node)) { - ret = (ge::AnchorUtils::GetStatus(node.GetInDataAnchor(static_cast(index))) == ANCHOR_DATA); // lint !e712 + ret = (ge::AnchorUtils::GetStatus(node.GetInDataAnchor(static_cast(index))) == 
ANCHOR_DATA); } else { for (const auto &anchor : node.GetAllInDataAnchors()) { if (anchor->GetIdx() != static_cast(index)) { @@ -822,4 +821,3 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus OpDescUtils::SetSubgr return op_desc->SetSubgraphInstanceName(iter->second, subgraph_instance_name); } } // namespace ge -/*lint +e512 +e737 +e752*/ diff --git a/src/common/graph/utils/tuning_utils.cc b/src/common/graph/utils/tuning_utils.cc index 0f07a197..a5a15562 100644 --- a/src/common/graph/utils/tuning_utils.cc +++ b/src/common/graph/utils/tuning_utils.cc @@ -17,8 +17,10 @@ #include "graph/tuning_utils.h" #include "../debug/ge_util.h" #include "../debug/ge_op_types.h" +#include "framework/common/scope_guard.h" namespace ge { +namespace { const std::string peer_node_name_attr = "_peerNodeName"; const std::string parent_node_name_attr = "_parentNodeName"; const std::string alias_name_attr = "_aliasName"; @@ -28,6 +30,7 @@ const std::string tuning_subgraph_prefix = "/aicore_subgraph_"; const std::string non_tuning_subgraph_prefix = "/subgraph_"; const std::set kPartitionOpTypes = {PLACEHOLDER, END}; const std::set kExeTypes = {DATA, NETOUTPUT}; +} // namespace NodeNametoNodeNameMap TuningUtils::data_2_netoutput_; NodetoNodeNameMap TuningUtils::data_node_2_netoutput_; NodetoNodeMap TuningUtils::data_node_2_netoutput_node_; @@ -116,6 +119,10 @@ graphStatus TuningUtils::ConvertGraphToFile(std::vector tuning_ // +---------------+ graphStatus TuningUtils::MakeExeGraph(ComputeGraphPtr &exe_graph, const HelpInfo &help_info) { GE_CHECK_NOTNULL(exe_graph); + + // clear graph id + GELOGI("TUU:clear [%s] session_graph_id %s", exe_graph->GetName().c_str(), + (AttrUtils::SetStr(*exe_graph, ATTR_NAME_SESSION_GRAPH_ID, "") ? 
"success" : "not success")); // if not make exe, just dump and return if (!help_info.exe_flag) { DumpGraphToPath(exe_graph, help_info.index, help_info.is_tuning_graph, help_info.path); @@ -346,7 +353,9 @@ graphStatus TuningUtils::LinkEnd2NetOutput(NodePtr &end_node, NodePtr &out_node) AnchorPtr end_in_anchor = (end_node->GetInDataAnchor(0)->GetFirstPeerAnchor() == nullptr) ? Anchor::DynamicAnchorCast(end_node->GetInControlAnchor()) : Anchor::DynamicAnchorCast(end_node->GetInDataAnchor(0)); + GE_CHECK_NOTNULL(end_in_anchor); auto src_anchor = end_in_anchor->GetFirstPeerAnchor(); // src_anchor should be only 1 + GE_CHECK_NOTNULL(src_anchor); if (GraphUtils::RemoveEdge(src_anchor, end_in_anchor) != GRAPH_SUCCESS) { GELOGE(FAILED, "TUU:remove end input edge from from %s(%d) to %s(%d) failed. node_name:%s, graph_name:%s", GetNodeNameByAnchor(src_anchor.get()).c_str(), src_anchor->GetIdx(), @@ -447,6 +456,14 @@ graphStatus TuningUtils::HandleEnd(NodePtr &node) { // part 2 graphStatus TuningUtils::ConvertFileToGraph(const map &options, ge::Graph &graph) { + std::function callback = [&]() { + data_2_netoutput_.clear(); + data_node_2_netoutput_.clear(); + data_node_2_netoutput_node_.clear(); + netoutput_nodes_.clear(); + merged_graph_nodes_.clear(); + }; + GE_MAKE_GUARD(release, callback); // 1. 
get all subgraph object std::vector graphs; // options format like {index:"subgraph_path"} @@ -666,7 +683,9 @@ graphStatus TuningUtils::GetInAndOutAnchorPair(NodePtr &data_node, NodePtr &out_ GE_CHECK_NOTNULL(src_anchor); auto src_node = src_anchor->GetOwnerNode(); GE_CHECK_NOTNULL(src_node); - if (src_node->GetName() == netoutput_input_name && src_anchor->GetIdx() == parent_node_anchor_index) { + std::string src_node_name = src_node->GetName(); + if (src_node_name.find(netoutput_input_name) != src_node_name.npos && + src_anchor->GetIdx() == parent_node_anchor_index) { dest_in_anchor = in_anchor; src_out_anchor = src_anchor; GELOGD("TUU:get out node:%s 's in anchor(%d) src_node:%s 's out anchor(%d) related with data node:%s", diff --git a/src/ge/CMakeLists.txt b/src/ge/CMakeLists.txt index 3f4f1a8b..8ed405e9 100755 --- a/src/ge/CMakeLists.txt +++ b/src/ge/CMakeLists.txt @@ -39,7 +39,7 @@ ge_protobuf_generate(ge PROTO_HEADER_SRCS PROTO_HEADER_HDRS ${PROTO_HEADER_LIST} # include directories include_directories(${CMAKE_CURRENT_LIST_DIR}) include_directories(${GE_SOURCE_DIR}) -include_directories(${GE_SOURCE_DIR}/src) +include_directories(${GE_SOURCE_DIR}/src/ge) include_directories(${GE_SOURCE_DIR}/src/ge/analyzer) include_directories(${GE_SOURCE_DIR}/inc) include_directories(${GE_SOURCE_DIR}/inc/common/util) @@ -109,6 +109,8 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/manager/graph_mem_allocator.cc" "graph/manager/graph_caching_allocator.cc" "graph/manager/graph_var_manager.cc" + "graph/manager/host_mem_manager.cc" + "graph/manager/memory_api.cc" "graph/manager/model_manager/event_manager.cc" "graph/manager/rdma_pool_allocator.cc" "graph/manager/trans_var_data_utils.cc" @@ -127,6 +129,7 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/partition/dynamic_shape_partition.cc" "graph/partition/engine_place.cc" "graph/partition/graph_partition.cc" + "graph/partition/stage_partition.cc" "graph/passes/*.cc" 
"graph/preprocess/graph_preprocess.cc" "graph/preprocess/insert_op/ge_aipp_op.cc" @@ -200,6 +203,7 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "model/ge_root_model.cc" "omm/csa_interact.cc" "opskernel_manager/ops_kernel_manager.cc" + "opskernel_manager/ops_kernel_builder_manager.cc" "session/inner_session.cc" "session/session_manager.cc" "single_op/*.cc" @@ -283,6 +287,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/manager/graph_manager.cc" "graph/manager/graph_manager_utils.cc" "graph/manager/graph_mem_allocator.cc" + "graph/manager/host_mem_manager.cc" "graph/manager/trans_var_data_utils.cc" "graph/manager/graph_var_manager.cc" "graph/manager/model_manager/event_manager.cc" @@ -296,6 +301,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/partition/dynamic_shape_partition.cc" "graph/partition/engine_place.cc" "graph/partition/graph_partition.cc" + "graph/partition/stage_partition.cc" "graph/passes/*.cc" "graph/preprocess/graph_preprocess.cc" "graph/preprocess/insert_op/ge_aipp_op.cc" @@ -349,6 +355,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "model/ge_root_model.cc" "omm/csa_interact.cc" "opskernel_manager/ops_kernel_manager.cc" + "opskernel_manager/ops_kernel_builder_manager.cc" "session/inner_session.cc" "session/session_manager.cc" "single_op/*.cc" diff --git a/src/ge/analyzer/analyzer.cc b/src/ge/analyzer/analyzer.cc index b7d09bea..88a0e294 100644 --- a/src/ge/analyzer/analyzer.cc +++ b/src/ge/analyzer/analyzer.cc @@ -75,9 +75,8 @@ Status Analyzer::BuildJsonObject(uint64_t session_id, uint64_t graph_id) { std::lock_guard lg(mutex_); auto iter = graph_infos_.find(session_id); if (iter == graph_infos_.end()) { - auto p = new (std::nothrow) GraphInfo(); - GE_CHECK_NOTNULL(p); - std::shared_ptr graph_info(p); + std::shared_ptr graph_info(new (std::nothrow) GraphInfo()); + GE_CHECK_NOTNULL(graph_info); std::map> graph_map; graph_map[graph_id] = graph_info; graph_info->session_id 
= session_id; @@ -86,9 +85,8 @@ Status Analyzer::BuildJsonObject(uint64_t session_id, uint64_t graph_id) { } else { auto iter1 = (iter->second).find(graph_id); if (iter1 == (iter->second).end()) { - auto p = new (std::nothrow) GraphInfo(); - GE_CHECK_NOTNULL(p); - std::shared_ptr graph_info(p); + std::shared_ptr graph_info(new (std::nothrow) GraphInfo()); + GE_CHECK_NOTNULL(graph_info); graph_info->session_id = session_id; graph_info->graph_id = graph_id; (iter->second).insert({graph_id, graph_info}); @@ -100,7 +98,14 @@ Status Analyzer::BuildJsonObject(uint64_t session_id, uint64_t graph_id) { } ge::Status Analyzer::Initialize() { - ClearHistoryFile(); + // Initialize file + string real_path = RealPath(kFilePath.c_str()); + if (real_path.empty()) { + GELOGE(FAILED, "File path is invalid."); + return FAILED; + } + json_file_name_ = real_path + "/" + kAnalyzeFile; + return SUCCESS; } @@ -138,6 +143,7 @@ void Analyzer::DestroyGraphJsonObject(uint64_t session_id, uint64_t graph_id) { if (iter1 == (iter->second).end()) { GELOGW("Can not find the graph json object by session_id[%lu] and graph_id[%lu]. Do nothing.", session_id, graph_id); + return; } (iter->second).erase(iter1); } @@ -174,15 +180,8 @@ ge::Status Analyzer::CreateAnalyzerFile() { return SUCCESS; } GELOGD("start to create analyzer file!"); - // Check whether the manifest exists, if not, create it. 
- string real_path = RealPath(kFilePath.c_str()); - if (real_path.empty()) { - GELOGE(FAILED, "File path is invalid."); - return FAILED; - } + std::lock_guard lg(file_mutex_); - json_file_name_ = real_path + "/" + kAnalyzeFile; - GELOGD("Created analyzer file:[%s]", json_file_name_.c_str()); int fd = open(json_file_name_.c_str(), O_WRONLY | O_CREAT | O_TRUNC, kFileAuthority); if (fd < 0) { GELOGE(INTERNAL_ERROR, "Fail to open the file: %s.", json_file_name_.c_str()); @@ -198,25 +197,27 @@ ge::Status Analyzer::CreateAnalyzerFile() { return SUCCESS; } -ge::Status Analyzer::SaveAnalyzerDataToFile() { +ge::Status Analyzer::SaveAnalyzerDataToFile(uint64_t session_id, uint64_t graph_id) { GELOGD("start to save analyze file!"); + + auto graph_info = GetJsonObject(session_id, graph_id); + GE_CHECK_NOTNULL(graph_info); + if (graph_info->op_info.size() == 0) { + GELOGD("session_id:%lu graph_id:%lu does not owner op info, break it!", session_id, graph_id); + return SUCCESS; + } std::lock_guard lg(file_mutex_); - json_file_.open(json_file_name_, std::ios::out); + json_file_.open(json_file_name_, std::ios::app); if (!json_file_.is_open()) { GELOGE(FAILED, "analyzer file does not exist[%s]", json_file_name_.c_str()); return PARAM_INVALID; } - std::lock_guard lk(mutex_); - for (auto &ele : graph_infos_) { - for (auto &ele2 : ele.second) { - json jsn; - GraphInfoToJson(jsn, *(ele2.second)); - json_file_ << jsn.dump(kJsonDumpLevel) << std::endl; - } - } - + json jsn; + GraphInfoToJson(jsn, *graph_info); + json_file_ << jsn.dump(kJsonDumpLevel) << std::endl; json_file_.close(); + return SUCCESS; } @@ -237,13 +238,7 @@ ge::Status Analyzer::DoAnalyze(DataInfo &data_info) { return FAILED; } // create json file - status = CreateAnalyzerFile(); - if (status != SUCCESS) { - GELOGE(status, "create analyzer file failed!"); - return status; - } - // save data to file - return SaveAnalyzerDataToFile(); + return CreateAnalyzerFile(); } ge::Status Analyzer::SaveOpInfo(ge::OpDescPtr desc, 
DataInfo &data_info, diff --git a/src/ge/analyzer/analyzer.h b/src/ge/analyzer/analyzer.h index 1afeeca3..ddc62681 100644 --- a/src/ge/analyzer/analyzer.h +++ b/src/ge/analyzer/analyzer.h @@ -156,6 +156,14 @@ class Analyzer { */ ge::Status DoAnalyze(analyzer::DataInfo &data_info); + /** + * @ingroup ge + * @brief: Buff analyzed data and output to json file + * @param [in]: session id , graph id + * @return: 0: SUCCESS other: FAILED + */ + ge::Status SaveAnalyzerDataToFile(uint64_t session_id, uint64_t graph_id); + Analyzer(const Analyzer &) = delete; Analyzer &operator=(const Analyzer &) = delete; Analyzer(Analyzer &&) = delete; @@ -166,7 +174,6 @@ class Analyzer { void OpInfoToJson(nlohmann::json &j, const analyzer::OpInfo &op_info); void GraphInfoToJson(nlohmann::json &j, const analyzer::GraphInfo &graph_info); - ge::Status SaveAnalyzerDataToFile(); ge::Status SaveOpInfo(ge::OpDescPtr desc, analyzer::DataInfo &data_info, std::shared_ptr graph_info); diff --git a/src/ge/client/ge_prof.cc b/src/ge/client/ge_prof.cc index ad9cc9eb..07c07b67 100644 --- a/src/ge/client/ge_prof.cc +++ b/src/ge/client/ge_prof.cc @@ -324,10 +324,17 @@ Status aclgrphProfStop(aclgrphProfConfig *profiler_config) { return GE_PROF_NOT_INIT; } - Status ret = ProfStopProfiling(&profiler_config->config); - if (ret != SUCCESS) { - GELOGE(ret, "Stop profiling failed, prof result = %d", ret); - return ret; + for (uint32_t i = 0; i < profiler_config->config.devNums; i++) { + uint64_t data_type_config; + Status status = ProfGetDataTypeConfig(profiler_config->config.devIdList[i], data_type_config); + if (status != SUCCESS) { + GELOGE(status, "Prof get data type config failed, prof result = %d", status); + return status; + } + if (data_type_config != profiler_config->config.dataTypeConfig) { + GELOGE(FAILED, "data type config verify failed"); + return FAILED; + } } std::vector prof_params; @@ -344,12 +351,18 @@ Status aclgrphProfStop(aclgrphProfConfig *profiler_config) { command.module_index = 
profiler_config->config.dataTypeConfig; GELOGI("Profiling will stop, device nums:%s , deviceID:[%s], data type config: 0x%llx", prof_params[0].c_str(), prof_params[kDeviceListIndex].c_str(), command.module_index); - ret = graph_loader.CommandHandle(command); + Status ret = graph_loader.CommandHandle(command); if (ret != SUCCESS) { GELOGE(ret, "Handle profiling command failed"); return FAILED; } + ret = ProfStopProfiling(&profiler_config->config); + if (ret != SUCCESS) { + GELOGE(ret, "Stop profiling failed, prof result = %d", ret); + return ret; + } + GELOGI("Successfully execute GraphProfStopProfiling."); return SUCCESS; } diff --git a/src/ge/client/module.mk b/src/ge/client/module.mk index 476841c9..1a304cbf 100644 --- a/src/ge/client/module.mk +++ b/src/ge/client/module.mk @@ -70,10 +70,9 @@ LOCAL_SHARED_LIBRARIES := \ libregister \ libge_compiler \ libge_common \ - libmsprof \ - stub/libascend_hal + libmsprof + -LOCAL_STATIC_LIBRARIES := libmsprofiler LOCAL_LDFLAGS := -lrt -ldl @@ -108,7 +107,6 @@ LOCAL_SHARED_LIBRARIES := \ libge_common \ libmsprof -LOCAL_STATIC_LIBRARIES := libmsprofiler LOCAL_LDFLAGS := -lrt -ldl LOCAL_CFLAGS += \ diff --git a/src/ge/common/auth/file_saver.cc b/src/ge/common/auth/file_saver.cc index 4aaf9c19..a044e56f 100644 --- a/src/ge/common/auth/file_saver.cc +++ b/src/ge/common/auth/file_saver.cc @@ -55,9 +55,26 @@ Status FileSaver::OpenFile(int32_t &fd, const std::string &file_path) { Status FileSaver::WriteData(const void *data, uint32_t size, int32_t fd) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(size == 0 || data == nullptr, return PARAM_INVALID); - + mmSsize_t write_count; + uint32_t size_2g = ((uint32_t)0x1 << 31); + uint32_t size_1g = ((uint32_t)0x1 << 30); // Write data - int32_t write_count = mmWrite(fd, const_cast(data), size); + if (size > size_2g) { + auto seek = reinterpret_cast(const_cast(data)); + while (size > size_1g) { + write_count = mmWrite(fd, reinterpret_cast(seek), size_1g); + if (write_count == EN_INVALID_PARAM || 
write_count == EN_ERROR) { + GELOGE(FAILED, "Write data failed. mmpa_errorno = %d, %s", write_count, strerror(errno)); + return FAILED; + } + size -= size_1g; + seek += size_1g; + } + write_count = mmWrite(fd, reinterpret_cast(seek), size); + } else { + write_count = mmWrite(fd, const_cast(data), size); + } + // -1: Failed to write to file; - 2: Illegal parameter if (write_count == EN_INVALID_PARAM || write_count == EN_ERROR) { GELOGE(FAILED, "Write data failed. mmpa_errorno = %d, %s", write_count, strerror(errno)); @@ -117,6 +134,7 @@ Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFi WriteData(static_cast(&model_partition_table), table_size, fd) != SUCCESS, ret = FAILED; break); // Write partition data for (const auto &partitionData : partition_datas) { + GELOGI("GC:size[%zu]", partitionData.size); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( WriteData(static_cast(partitionData.data), partitionData.size, fd) != SUCCESS, ret = FAILED; break); diff --git a/src/ge/common/convert/pb2json.cc b/src/ge/common/convert/pb2json.cc deleted file mode 100644 index 0a5d24ee..00000000 --- a/src/ge/common/convert/pb2json.cc +++ /dev/null @@ -1,248 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// File: pb2json.h -// Description: This imply file for protobuf message and json interconversion - -#include "common/convert/pb2json.h" -#include -#include -#include "securec.h" -#include "framework/common/fmk_types.h" -#include "framework/common/debug/ge_log.h" - -using std::set; -using std::string; - -namespace ge { -namespace { -const int kSignificantDigits = 10; -} -// JSON parses non utf8 character throwing exceptions, so some fields need to be shielded through black fields -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void Pb2Json::Message2Json(const ProtobufMsg &message, - const set &black_fields, Json &json, - bool enum2str) { - auto descriptor = message.GetDescriptor(); - auto reflection = message.GetReflection(); - if (descriptor == nullptr || reflection == nullptr) { - return; - } - - auto count = descriptor->field_count(); - - for (auto i = 0; i < count; ++i) { - const auto field = descriptor->field(i); - if (field == nullptr) { - return; - } - - // Do not display weight data - if (black_fields.find(field->name()) != black_fields.end()) { - continue; - } - - if (field->is_repeated()) { - if (reflection->FieldSize(message, field) > 0) { - RepeatedMessage2Json(message, field, reflection, black_fields, json[field->name()], enum2str); - } - continue; - } - - if (!reflection->HasField(message, field)) { - continue; - } - - OneField2Json(message, field, reflection, black_fields, json, enum2str); - } -} - -void Pb2Json::OneField2Json(const ProtobufMsg &message, const ProtobufFieldDescriptor *field, - const ProtobufReflection *reflection, const set &black_fields, Json &json, - bool enum2str) { - switch (field->type()) { - case ProtobufFieldDescriptor::TYPE_MESSAGE: { - const ProtobufMsg &tmp_message = reflection->GetMessage(message, field); - if (0 != tmp_message.ByteSize()) { - Message2Json(tmp_message, black_fields, json[field->name()], enum2str); - } - break; - } - - case ProtobufFieldDescriptor::TYPE_BOOL: - json[field->name()] = 
reflection->GetBool(message, field); - break; - - case ProtobufFieldDescriptor::TYPE_ENUM: { - auto *enum_value_desc = reflection->GetEnum(message, field); - Enum2Json(enum_value_desc, field, enum2str, json); - break; - } - - case ProtobufFieldDescriptor::TYPE_INT32: - case ProtobufFieldDescriptor::TYPE_SINT32: - case ProtobufFieldDescriptor::TYPE_SFIXED32: - json[field->name()] = reflection->GetInt32(message, field); - break; - - case ProtobufFieldDescriptor::TYPE_UINT32: - case ProtobufFieldDescriptor::TYPE_FIXED32: - json[field->name()] = reflection->GetUInt32(message, field); - break; - - case ProtobufFieldDescriptor::TYPE_INT64: - case ProtobufFieldDescriptor::TYPE_SINT64: - case ProtobufFieldDescriptor::TYPE_SFIXED64: - json[field->name()] = reflection->GetInt64(message, field); - break; - - case ProtobufFieldDescriptor::TYPE_UINT64: - case ProtobufFieldDescriptor::TYPE_FIXED64: - json[field->name()] = reflection->GetUInt64(message, field); - break; - - case ProtobufFieldDescriptor::TYPE_FLOAT: - char str[kSignificantDigits]; - if (sprintf_s(str, kSignificantDigits, "%g", reflection->GetFloat(message, field)) != -1) { - json[field->name()] = str; - } else { - json[field->name()] = reflection->GetFloat(message, field); - } - - break; - - case ProtobufFieldDescriptor::TYPE_STRING: - json[field->name()] = reflection->GetString(message, field); - break; - - case ProtobufFieldDescriptor::TYPE_BYTES: { - string field_name = field->name(); - string type_bytes = reflection->GetString(message, field); - json[field_name] = TypeBytes2String(field_name, type_bytes); - break; - } - - default: - break; - } -} - -string Pb2Json::TypeBytes2String(string &field_name, string &type_bytes) { - if (field_name != "offset") { - return type_bytes; - } - string result = ""; - for (char temp_value : type_bytes) { - uint8_t *value = 0; - value = reinterpret_cast(&temp_value); - char str[kSignificantDigits]; - if (sprintf_s(str, kSignificantDigits, "%d", *value) == -1) { - 
GELOGW("Convert bytes to string fail, filed name:%s", field_name.c_str()); - continue; - } - result += str; - } - return result; -} - -void Pb2Json::RepeatedMessage2Json(const ProtobufMsg &message, const ProtobufFieldDescriptor *field, - const ProtobufReflection *reflection, const set &black_fields, Json &json, - bool enum2str) { - if ((field == nullptr) || (reflection == nullptr)) { - Message2Json(message, black_fields, json, enum2str); - return; - } - - for (auto i = 0; i < reflection->FieldSize(message, field); ++i) { - Json tmp_json; - switch (field->type()) { - case ProtobufFieldDescriptor::TYPE_MESSAGE: { - const ProtobufMsg &tmp_message = reflection->GetRepeatedMessage(message, field, i); - if (0 != tmp_message.ByteSize()) { - Message2Json(tmp_message, black_fields, tmp_json, enum2str); - } - } break; - - case ProtobufFieldDescriptor::TYPE_BOOL: - tmp_json = reflection->GetRepeatedBool(message, field, i); - break; - - case ProtobufFieldDescriptor::TYPE_ENUM: { - auto *enum_value_desc = reflection->GetRepeatedEnum(message, field, i); - RepeatedEnum2Json(enum_value_desc, enum2str, tmp_json); - } break; - - case ProtobufFieldDescriptor::TYPE_INT32: - case ProtobufFieldDescriptor::TYPE_SINT32: - case ProtobufFieldDescriptor::TYPE_SFIXED32: - tmp_json = reflection->GetRepeatedInt32(message, field, i); - break; - - case ProtobufFieldDescriptor::TYPE_UINT32: - case ProtobufFieldDescriptor::TYPE_FIXED32: - tmp_json = reflection->GetRepeatedUInt32(message, field, i); - break; - - case ProtobufFieldDescriptor::TYPE_INT64: - case ProtobufFieldDescriptor::TYPE_SINT64: - case ProtobufFieldDescriptor::TYPE_SFIXED64: - tmp_json = reflection->GetRepeatedInt64(message, field, i); - break; - - case ProtobufFieldDescriptor::TYPE_UINT64: - case ProtobufFieldDescriptor::TYPE_FIXED64: - tmp_json = reflection->GetRepeatedUInt64(message, field, i); - break; - - case ProtobufFieldDescriptor::TYPE_FLOAT: - tmp_json = reflection->GetRepeatedFloat(message, field, i); - break; - - case 
ProtobufFieldDescriptor::TYPE_STRING: - case ProtobufFieldDescriptor::TYPE_BYTES: - tmp_json = reflection->GetRepeatedString(message, field, i); - break; - - default: - break; - } - json += tmp_json; - } -} - -void Pb2Json::Enum2Json(const ProtobufEnumValueDescriptor *enum_value_desc, const ProtobufFieldDescriptor *field, - bool enum2str, Json &json) { - if (enum_value_desc != nullptr) { - if (field == nullptr) { - return; - } - if (enum2str) { - json[field->name()] = enum_value_desc->name(); - } else { - json[field->name()] = enum_value_desc->number(); - } - } -} - -void Pb2Json::RepeatedEnum2Json(const ProtobufEnumValueDescriptor *enum_value_desc, bool enum2str, Json &json) { - if (enum_value_desc != nullptr) { - if (enum2str) { - json = enum_value_desc->name(); - } else { - json = enum_value_desc->number(); - } - } -} -} // namespace ge diff --git a/src/ge/common/convert/pb2json.h b/src/ge/common/convert/pb2json.h deleted file mode 100644 index 88ded50e..00000000 --- a/src/ge/common/convert/pb2json.h +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// File: pb2json.h -// Description: This header file for protobuf message and json interconversion - -#ifndef GE_COMMON_CONVERT_PB2JSON_H_ -#define GE_COMMON_CONVERT_PB2JSON_H_ -#include -#include -#include -#include -#include "google/protobuf/descriptor.h" -#include "google/protobuf/message.h" -#include "nlohmann/json.hpp" - -namespace ge { -using Json = nlohmann::json; -using ProtobufMsg = ::google::protobuf::Message; -using ProtobufReflection = ::google::protobuf::Reflection; -using ProtobufFieldDescriptor = ::google::protobuf::FieldDescriptor; -using ProtobufDescriptor = ::google::protobuf::Descriptor; -using ProtobufEnumValueDescriptor = ::google::protobuf::EnumValueDescriptor; - -class Pb2Json { - public: - /** - * @ingroup domi_omg - * @brief Transfer protobuf object to JSON object - * @param [out] json Converted JSON object - * @return void success - * @author - */ - static void Message2Json(const ProtobufMsg &message, const std::set &black_fields, Json &json, - bool enum2str = false); - - protected: - static void RepeatedMessage2Json(const ProtobufMsg &message, const ProtobufFieldDescriptor *field, - const ProtobufReflection *reflection, const std::set &black_fields, - Json &json, bool enum2str); - - static void Enum2Json(const ProtobufEnumValueDescriptor *enum_value_desc, const ProtobufFieldDescriptor *field, - bool enum2str, Json &json); - - static void RepeatedEnum2Json(const ProtobufEnumValueDescriptor *enum_value_desc, bool enum2str, Json &json); - - static void OneField2Json(const ProtobufMsg &message, const ProtobufFieldDescriptor *field, - const ProtobufReflection *reflection, const std::set &black_fields, Json &json, - bool enum2str); - - static std::string TypeBytes2String(std::string &field_name, std::string &type_bytes); -}; -} // namespace ge - -#endif // GE_COMMON_CONVERT_PB2JSON_H_ diff --git a/src/ge/common/dump/dump_properties.cc b/src/ge/common/dump/dump_properties.cc index b6247c6e..aec70cf9 100644 --- 
a/src/ge/common/dump/dump_properties.cc +++ b/src/ge/common/dump/dump_properties.cc @@ -201,7 +201,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperti } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetDumpOpSwitch( - const std::string &dump_op_switch) { + const std::string dump_op_switch) { dump_op_switch_ = dump_op_switch; } diff --git a/src/ge/common/dump/dump_properties.h b/src/ge/common/dump/dump_properties.h index 7909d5a5..9fefc7d6 100644 --- a/src/ge/common/dump/dump_properties.h +++ b/src/ge/common/dump/dump_properties.h @@ -65,7 +65,7 @@ class DumpProperties { const std::string &GetDumpStatus() const; - void SetDumpOpSwitch(const std::string &dump_op_switch); + void SetDumpOpSwitch(const std::string dump_op_switch); const std::string &GetDumpOpSwitch() const; diff --git a/src/ge/common/ge/tbe_plugin_manager.cc b/src/ge/common/ge/tbe_plugin_manager.cc index 8a594cb9..731d7bb0 100644 --- a/src/ge/common/ge/tbe_plugin_manager.cc +++ b/src/ge/common/ge/tbe_plugin_manager.cc @@ -94,13 +94,6 @@ void TBEPluginManager::ProcessSoFullName(vector &file_list, string &caff full_name.compare(full_name.size() - caffe_parser_so_suff.size(), caffe_parser_so_suff.size(), caffe_parser_so_suff) == 0) { caffe_parser_path = full_name; - } else if ((full_name.size() >= aicpu_so_suff.size() && - full_name.compare(full_name.size() - aicpu_so_suff.size(), aicpu_so_suff.size(), aicpu_so_suff) == 0) || - (full_name.size() >= aicpu_host_so_suff.size() && - full_name.compare(full_name.size() - aicpu_host_so_suff.size(), aicpu_host_so_suff.size(), - aicpu_host_so_suff) == 0)) { - // aicpu so, Put the file path into the omgcontext and save into the model in the builder stage. 
- domi::GetContext().aicpu_op_run_paths.push_back(full_name); } else { // Save parser so path into file_list vector file_list.push_back(full_name); @@ -230,39 +223,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void TBEPluginManager::LoadPlug } } -Status TBEPluginManager::CheckCustomAiCpuOpLib() { - std::vector vec_op_type; - - domi::OpRegistry::Instance()->GetOpTypeByImplyType(vec_op_type, domi::ImplyType::CUSTOM); - for (size_t i = 0; i < vec_op_type.size(); i++) { - bool aicpu_so_exist = false; - std::string ai_cpu_so_name = "lib" + vec_op_type[i] + "_aicpu.so"; - for (size_t j = 0; j < domi::GetContext().aicpu_op_run_paths.size(); j++) { - string bin_file_path = domi::GetContext().aicpu_op_run_paths[j]; - if (bin_file_path.size() >= ai_cpu_so_name.size() && - bin_file_path.compare(bin_file_path.size() - ai_cpu_so_name.size(), ai_cpu_so_name.size(), ai_cpu_so_name) == - 0) { - aicpu_so_exist = true; - break; - } - } - if (!aicpu_so_exist) { - GELOGE(FAILED, "Can't find aicpu run so(%s), please check the plugin path!", ai_cpu_so_name.c_str()); - return FAILED; - } - } - return SUCCESS; -} - FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void TBEPluginManager::InitPreparation( const std::map &options) { options_.insert(options.begin(), options.end()); // Load TBE plugin TBEPluginManager::Instance().LoadCustomOpLib(); - Status ret = CheckCustomAiCpuOpLib(); - if (ret != SUCCESS) { - GELOGE(ret, "Check custom aicpu run so failed!"); - return; - } } } // namespace ge diff --git a/src/ge/common/ge/tbe_plugin_manager.h b/src/ge/common/ge/tbe_plugin_manager.h index 2a55e450..4430d4aa 100644 --- a/src/ge/common/ge/tbe_plugin_manager.h +++ b/src/ge/common/ge/tbe_plugin_manager.h @@ -62,7 +62,6 @@ class TBEPluginManager { static void GetPluginSoFileList(const string &path, vector &file_list, string &caffe_parser_path); static void GetCustomOpPath(std::string &customop_path); void LoadCustomOpLib(); - static Status CheckCustomAiCpuOpLib(); SoHandlesVec 
handles_vec_; static std::map options_; diff --git a/src/ge/common/ge_common.mk b/src/ge/common/ge_common.mk index e40ef3c1..efddc788 100644 --- a/src/ge/common/ge_common.mk +++ b/src/ge/common/ge_common.mk @@ -71,7 +71,10 @@ GE_COMMON_LOCAL_C_INCLUDES := \ $(TOPDIR)third_party/openssl/include/x86/include \ $(TOPDIR)framework/domi \ $(TOPDIR)framework/domi/common \ - $(TOPDIR)framework/domi/common/op + $(TOPDIR)framework/domi/common/op \ + $(TOPDIR)graphengine/ge \ + $(TOPDIR)graphengine/ge/common \ + $(TOPDIR)graphengine/ge/common/op \ #compile host libge_common include $(CLEAR_VARS) diff --git a/src/ge/common/helper/model_cache_helper.cc b/src/ge/common/helper/model_cache_helper.cc index b1a71b0a..0c8fddda 100644 --- a/src/ge/common/helper/model_cache_helper.cc +++ b/src/ge/common/helper/model_cache_helper.cc @@ -1497,7 +1497,6 @@ Status ModelCacheHelper::ParseMemResourceFromJson(const Json &json, map(); uint64_t var_mem_size = mem_resource_json[kVarMemSize].get(); diff --git a/src/ge/common/op/attr_value_util.cc b/src/ge/common/op/attr_value_util.cc index 5d74aa1d..6d963181 100644 --- a/src/ge/common/op/attr_value_util.cc +++ b/src/ge/common/op/attr_value_util.cc @@ -17,6 +17,7 @@ #include "framework/common/op/attr_value_util.h" #include "framework/common/debug/log.h" #include "framework/common/util.h" +#include "register/register_types.h" namespace ge { #define DEFINE_SET_ATTR_VALUE_ONE(ARG_TYPE, FIELD) \ diff --git a/src/ge/common/op/ge_op_utils.cc b/src/ge/common/op/ge_op_utils.cc index 1dc268b2..d7d56ec5 100644 --- a/src/ge/common/op/ge_op_utils.cc +++ b/src/ge/common/op/ge_op_utils.cc @@ -27,6 +27,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "framework/common/op/attr_value_util.h" #include "framework/common/util.h" +#include "framework/common/types.h" #include "graph/anchor.h" #include "graph/debug/ge_attr_define.h" #include "graph/utils/op_desc_utils.h" diff --git a/src/ge/common/profiling/profiling_manager.cc 
b/src/ge/common/profiling/profiling_manager.cc index 9492045c..692e50b0 100644 --- a/src/ge/common/profiling/profiling_manager.cc +++ b/src/ge/common/profiling/profiling_manager.cc @@ -353,20 +353,18 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::StopProf } uint64_t module = GetProfilingModule(); int32_t device_num = static_cast(device_id_.size()); - uint32_t *device_id_ptr = new (std::nothrow) uint32_t[device_num]; + auto device_id_ptr = std::unique_ptr(new (std::nothrow) uint32_t[device_num]); if (device_id_ptr == nullptr) { - GELOGE(FAILED, "Stop profiling device id ptr is null."); + GELOGE(FAILED, "Stop profiling: device id ptr is null."); return; } for (int32_t i = 0; i < device_num; i++) { device_id_ptr[i] = static_cast(device_id_[i]); } - rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr); + rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get()); if (rt_ret != RT_ERROR_NONE) { GELOGW("Call rtProfilerStop failed, ret:%d", rt_ret); } - delete[] device_id_ptr; - device_id_ptr = nullptr; for (size_t i = 0; i < prof_handle_vec_.size(); ++i) { int result = ProfMgrStop(prof_handle_vec_[i]); @@ -732,23 +730,21 @@ ProfilingManager::ProfStartProfiling(uint64_t module, const std::map(new (std::nothrow) uint32_t[device_num]); + if (device_id_ptr == nullptr) { + GELOGE(FAILED, "Prof start: device id ptr is null."); return FAILED; } for (int32_t i = 0; i < device_num; i++) { - device_id[i] = static_cast(device_list[i]); + device_id_ptr[i] = static_cast(device_list[i]); } GELOGI("Runtime config param: 0x%llx, device num: %d.", module, device_num); - rtError_t rt_ret = rtProfilerStart(module, device_num, device_id); + rtError_t rt_ret = rtProfilerStart(module, device_num, device_id_ptr.get()); if (rt_ret != RT_ERROR_NONE) { - delete[] device_id; GELOGE(FAILED, "Runtime profiler config proc failed."); return FAILED; } - delete[] device_id; - device_id = nullptr; if ((module & PROF_MODEL_EXECUTE_MASK) == 
PROF_MODEL_EXECUTE_MASK) { for (int32_t i = 0; i < device_num; i++) { if (std::find(device_id_.begin(), device_id_.end(), device_list[i]) == device_id_.end()) { @@ -776,23 +772,20 @@ ProfilingManager::ProfStopProfiling(uint64_t module, const std::map(new (std::nothrow) uint32_t[device_num]); + if (device_id_ptr == nullptr) { + GELOGE(FAILED, "Prof stop: device id ptr is null."); return FAILED; } for (int32_t i = 0; i < device_num; i++) { - device_id[i] = static_cast(device_list[i]); + device_id_ptr[i] = static_cast(device_list[i]); } GELOGI("Prof stop: runtime config param: 0x%llx, device num: %d", module, device_num); - rtError_t rt_ret = rtProfilerStop(module, device_num, device_id); + rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get()); if (rt_ret != RT_ERROR_NONE) { - delete[] device_id; GELOGE(FAILED, "Prof stop: runtime profiler config proc failed."); return FAILED; } - delete[] device_id; - device_id = nullptr; uint64_t execute_model_mask = module & PROF_MODEL_EXECUTE_MASK; if (execute_model_mask == PROF_MODEL_EXECUTE_MASK) { for (int32_t i = 0; i < device_num; i++) { diff --git a/src/ge/common/types.cc b/src/ge/common/types.cc index de293d34..ecda37e2 100644 --- a/src/ge/common/types.cc +++ b/src/ge/common/types.cc @@ -384,6 +384,7 @@ REGISTER_OPTYPE_DEFINE(HCOMREDUCESCATTER, "HcomReduceScatter"); REGISTER_OPTYPE_DEFINE(HCOMSEND, "HcomSend"); REGISTER_OPTYPE_DEFINE(HCOMRECEIVE, "HcomReceive"); REGISTER_OPTYPE_DEFINE(HCOMREMOTEREAD, "HcomRemoteRead"); +REGISTER_OPTYPE_DEFINE(HCOMREMOTEREFREAD, "HcomRemoteRefRead"); REGISTER_OPTYPE_DEFINE(HCOMREMOTEWRITE, "HcomRemoteWrite"); REGISTER_OPTYPE_DEFINE(VARASSIGN, "VarAssign"); diff --git a/src/ge/common/util.cc b/src/ge/common/util.cc index 4adf3ebd..e4bf17f2 100644 --- a/src/ge/common/util.cc +++ b/src/ge/common/util.cc @@ -54,8 +54,7 @@ const int kProtoReadBytesLimit = INT_MAX; // Max size of 2 GB minus 1 byte. 
const int kWarningThreshold = 536870912 * 2; // 536870912 represent 512M /// The maximum length of the file. -/// Based on the security coding specification and the current actual (protobuf) model size, it is determined as 2G-1 -const int kMaxFileSizeLimit = INT_MAX; +const uint32_t kMaxFileSizeLimit = UINT32_MAX; // 4G for now const int kMaxBuffSize = 256; const char *const kPathValidReason = "The path can only contain 'a-z' 'A-Z' '0-9' '-' '.' '_' and chinese character"; constexpr uint32_t kMaxConfigFileByte = 10 * 1024 * 1024; @@ -186,7 +185,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadBytesFromBinaryFile(co std::streamsize size = file.tellg(); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((size <= 0), file.close(); return false, "file length <= 0, not valid."); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(size > kMaxFileSizeLimit, file.close(); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(size > static_cast(kMaxFileSizeLimit), file.close(); return false, "file size %ld is out of limit: %d.", size, kMaxFileSizeLimit); file.seekg(0, std::ios::beg); // [no need to check value] @@ -304,7 +303,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromMem(const cha return ret; } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t GetCurrentTimestap() { +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t GetCurrentTimestamp() { struct timeval tv {}; int ret = gettimeofday(&tv, nullptr); GE_LOGE_IF(ret != 0, "Func gettimeofday may failed: ret=%d", ret); diff --git a/src/ge/engine_manager/dnnengine_manager.cc b/src/ge/engine_manager/dnnengine_manager.cc index 3389e1b9..0c1ba4e3 100644 --- a/src/ge/engine_manager/dnnengine_manager.cc +++ b/src/ge/engine_manager/dnnengine_manager.cc @@ -216,9 +216,9 @@ std::string DNNEngineManager::GetDNNEngineName(const ge::NodePtr &node_ptr) { if (kernel_info_store != kernel_map.end()) { std::string unsupported_reason; // It will be replaced by engine' checksupport - uint64_t start_time = GetCurrentTimestap(); + uint64_t 
start_time = GetCurrentTimestamp(); if (kernel_info_store->second->CheckSupported(op_desc, unsupported_reason)) { - checksupport_cost_[kernel_name] += GetCurrentTimestap() - start_time; + checksupport_cost_[kernel_name] += GetCurrentTimestamp() - start_time; op_desc->SetOpEngineName(it.engine); op_desc->SetOpKernelLibName(kernel_name); // set attrs for taking information when load txt to graph object @@ -228,7 +228,7 @@ std::string DNNEngineManager::GetDNNEngineName(const ge::NodePtr &node_ptr) { it.engine.c_str(), op_desc->GetName().c_str()); return it.engine; } else { - checksupport_cost_[kernel_name] += GetCurrentTimestap() - start_time; + checksupport_cost_[kernel_name] += GetCurrentTimestamp() - start_time; bool is_custom_op = false; if ((ge::AttrUtils::GetBool(op_desc, kCustomOpFlag, is_custom_op)) && is_custom_op) { ErrorManager::GetInstance().ATCReportErrMessage("E13001", {"kernelname", "optype", "opname"}, diff --git a/src/ge/engine_manager/engine_conf.json b/src/ge/engine_manager/engine_conf.json index 82360562..ad43c9ab 100755 --- a/src/ge/engine_manager/engine_conf.json +++ b/src/ge/engine_manager/engine_conf.json @@ -41,6 +41,13 @@ "skip_assign_stream": false, "attach": true }, + { + "id": "DNN_VM_AICPU_ASCEND", + "name": "AICPU_ASCEND", + "independent": false, + "skip_assign_stream": false, + "attach": true + }, { "id": "DNN_HCCL", "name": "HCCL", diff --git a/src/ge/executor/ge_executor.cc b/src/ge/executor/ge_executor.cc index 0a247142..9c116fee 100644 --- a/src/ge/executor/ge_executor.cc +++ b/src/ge/executor/ge_executor.cc @@ -38,6 +38,7 @@ #include "single_op/single_op_manager.h" #include "graph/manager/graph_var_manager.h" #include "graph/load/new_model_manager/davinci_model.h" +#include "opskernel_manager/ops_kernel_builder_manager.h" using std::string; using std::vector; @@ -241,12 +242,16 @@ Status GeExecutor::Initialize() { } std::vector mem_type(1, RT_MEMORY_HBM); + mem_type.push_back(RT_MEMORY_P2P_DDR); auto ret = 
MemManager::Instance().Initialize(mem_type); if (ret != SUCCESS) { GELOGE(ret, "Memory Manager init failed."); return ret; } + GE_CHK_STATUS_RET(OpsKernelBuilderManager::Instance().Initialize({}, false), + "Failed to initialize OpsKernelBuilders"); + // Start profiling Options profiling_options; profiling_options.device_id = 0; @@ -265,6 +270,8 @@ Status GeExecutor::Finalize() { return ge::SUCCESS; } + (void)OpsKernelBuilderManager::Instance().Finalize(); + // Stop profiling if (ProfilingManager::Instance().ProfilingOn()) { ProfilingManager::Instance().StopProfiling(); @@ -282,11 +289,14 @@ Status GeExecutor::SetDynamicBatchSize(uint32_t model_id, void *dynamic_input_ad return PARAM_INVALID; } - uint64_t size = sizeof(uint64_t); + uint64_t size = sizeof(uint32_t); if (length < size) { GELOGE(PARAM_INVALID, "Dynamic input size [%lu] is less than [%lu]!", length, size); return PARAM_INVALID; } + if (length >= sizeof(uint64_t)) { + size = sizeof(uint64_t); + } // Verify whether the input dynamic batch matches the model gear std::vector> batch_info; @@ -324,12 +334,15 @@ Status GeExecutor::SetDynamicImageSize(uint32_t model_id, void *dynamic_input_ad return PARAM_INVALID; } - uint64_t dynamic_input_size = kDynamicImageSizeInputSize * sizeof(uint64_t); + uint64_t dynamic_input_size = kDynamicImageSizeInputSize * sizeof(uint32_t); if (length < dynamic_input_size) { GELOGE(PARAM_INVALID, "Dynamic input size [%lu] is less than [%lu]!", length, dynamic_input_size); return PARAM_INVALID; } - + uint64_t size = sizeof(uint32_t); + if (length >= kDynamicImageSizeInputSize * sizeof(uint64_t)) { + size = sizeof(uint64_t); + } // Verify whether the input dynamic resolution matches the model gear std::vector> batch_info; std::vector batch_num{image_height, image_width}; @@ -350,18 +363,18 @@ Status GeExecutor::SetDynamicImageSize(uint32_t model_id, void *dynamic_input_ad GELOGE(ret, "Set dynamic size failed"); return ret; } + // Memcpy dynamic resolution height from host to device 
- rtError_t rt_ret = - rtMemcpy(dynamic_input_addr, sizeof(uint64_t), &image_height, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE); + rtError_t rt_ret = rtMemcpy(dynamic_input_addr, size, &image_height, size, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "memcpy dynamic resolution input data failed! ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } - uint64_t remain_size = length - sizeof(uint64_t); + uint64_t remain_size = length - size; // Memcpy dynamic resolution width from host to device - if (rtMemcpy(reinterpret_cast(reinterpret_cast(dynamic_input_addr) + sizeof(uint64_t)), - remain_size, &image_width, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE) != RT_ERROR_NONE) { + if (rtMemcpy(reinterpret_cast(reinterpret_cast(dynamic_input_addr) + size), remain_size, + &image_width, size, RT_MEMCPY_HOST_TO_DEVICE) != RT_ERROR_NONE) { GELOGE(FAILED, "memcpy dynamic resolution input data failed!"); return FAILED; } @@ -401,16 +414,19 @@ Status GeExecutor::SetDynamicDims(uint32_t model_id, void *dynamic_input_addr, u } size_t dynamic_dim_num = cur_dynamic_dims.size(); - uint64_t dynamic_input_size = static_cast(dynamic_dim_num * sizeof(uint64_t)); + uint64_t dynamic_input_size = static_cast(dynamic_dim_num * sizeof(uint32_t)); if (length < dynamic_input_size) { GELOGE(FAILED, "Dynamic input size [%lu] is less than [%lu]!", length, dynamic_input_size); return FAILED; } + uint64_t size = sizeof(uint32_t); + if (length >= dynamic_dim_num * sizeof(uint64_t)) { + size = sizeof(uint64_t); + } for (uint32_t i = 0; i < dynamic_dim_num; ++i) { // Memcpy dynamic dim[i] from host to device - if (rtMemcpy(reinterpret_cast(reinterpret_cast(dynamic_input_addr) + sizeof(uint64_t) * i), - length - sizeof(uint64_t) * i, &cur_dynamic_dims[i], sizeof(uint64_t), - RT_MEMCPY_HOST_TO_DEVICE) != RT_ERROR_NONE) { + if (rtMemcpy(reinterpret_cast(reinterpret_cast(dynamic_input_addr) + size * i), + length - size * i, &cur_dynamic_dims[i], size, 
RT_MEMCPY_HOST_TO_DEVICE) != RT_ERROR_NONE) { GELOGE(FAILED, "memcpy dynamic resolution input data failed!"); return FAILED; } @@ -1113,7 +1129,7 @@ Status GeExecutor::SetDump(const DumpConfig &dump_config) { GELOGE(ret, "Set dump conf failed"); return ret; } - GELOGI("Set dump config succ."); + GELOGI("Set dump config successfully"); return SUCCESS; } } // namespace ge diff --git a/src/ge/executor/module.mk b/src/ge/executor/module.mk index 1c3efe4c..c6831077 100644 --- a/src/ge/executor/module.mk +++ b/src/ge/executor/module.mk @@ -50,6 +50,7 @@ local_ge_executor_src_files := \ ../graph/load/new_model_manager/task_info/end_graph_task_info.cc \ ../graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc \ ../graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc \ + ../opskernel_manager/ops_kernel_builder_manager.cc \ ../single_op/single_op_manager.cc \ ../single_op/single_op_model.cc \ ../single_op/single_op.cc \ @@ -74,6 +75,7 @@ local_ge_executor_c_include := \ $(TOPDIR)inc/framework \ $(TOPDIR)inc \ $(LOCAL_PATH)/../ \ + $(TOPDIR)graphengine/ge \ $(TOPDIR)libc_sec/include \ third_party/protobuf/include \ third_party/json/include \ @@ -89,7 +91,6 @@ local_ge_executor_shared_library := \ libregister \ libmsprof \ liberror_manager \ - libascend_hal local_ge_executor_ldflags := -lrt -ldl \ @@ -105,7 +106,12 @@ LOCAL_SRC_FILES := $(local_ge_executor_src_files) LOCAL_C_INCLUDES := $(local_ge_executor_c_include) LOCAL_SHARED_LIBRARIES := $(local_ge_executor_shared_library) -LOCAL_STATIC_LIBRARIES := libmsprofiler + +LOCAL_SHARED_LIBRARIES += libascend_hal + +LOCAL_STATIC_LIBRARIES := \ + libmsprofiler \ + ifeq ($(device_os),android) LOCAL_LDFLAGS += -ldl LOCAL_LDLIBS += -L$(PWD)/prebuilts/clang/linux-x86/aarch64/android-ndk-r21/sysroot/usr/lib/aarch64-linux-android/29 -llog @@ -142,9 +148,10 @@ LOCAL_SHARED_LIBRARIES := \ libregister \ libmsprof \ liberror_manager \ - stub/libascend_hal + stub/libascend_hal \ -LOCAL_STATIC_LIBRARIES := 
libmsprofiler +LOCAL_STATIC_LIBRARIES := \ + libmsprofiler \ LOCAL_LDFLAGS += $(local_ge_executor_ldflags) diff --git a/src/ge/ge_inference.mk b/src/ge/ge_inference.mk index 621e42c5..ac106346 100644 --- a/src/ge/ge_inference.mk +++ b/src/ge/ge_inference.mk @@ -42,6 +42,7 @@ GRAPH_MANAGER_LOCAL_SRC_FILES := \ session/session_manager.cc \ engine_manager/dnnengine_manager.cc \ opskernel_manager/ops_kernel_manager.cc \ + opskernel_manager/ops_kernel_builder_manager.cc \ graph/manager/graph_manager.cc \ graph/manager/graph_manager_utils.cc \ graph/manager/graph_context.cc \ @@ -57,9 +58,11 @@ GRAPH_MANAGER_LOCAL_SRC_FILES := \ graph/partition/engine_place.cc \ graph/partition/graph_partition.cc \ graph/partition/dynamic_shape_partition.cc \ + graph/partition/stage_partition.cc \ generator/ge_generator.cc \ generator/generator_api.cc \ graph/manager/graph_var_manager.cc \ + graph/manager/host_mem_manager.cc \ graph/manager/rdma_pool_allocator.cc \ graph/manager/graph_mem_allocator.cc \ graph/manager/graph_caching_allocator.cc \ @@ -178,6 +181,7 @@ OMG_HOST_SRC_FILES := \ graph/passes/multi_batch_pass.cc \ graph/passes/multi_batch_clone_pass.cc \ graph/passes/subexpression_migration_pass.cc \ + graph/passes/subgraph_const_migration_pass.cc \ graph/passes/unused_args_clean_pass.cc \ graph/passes/next_iteration_pass.cc \ graph/passes/control_trigger_pass.cc \ @@ -343,6 +347,7 @@ DEVICE_LOCAL_C_INCLUDES := \ $(TOPDIR)inc/runtime \ $(TOPDIR)ops/built-in/op_proto/inc \ $(TOPDIR)framework/domi \ + $(TOPDIR)graphengine/ge \ $(TOPDIR)toolchain/ide/ide-daemon/external \ third_party/json/include \ third_party/protobuf/include \ diff --git a/src/ge/ge_local_engine/engine/host_cpu_engine.cc b/src/ge/ge_local_engine/engine/host_cpu_engine.cc index fc46385b..2a3a9a61 100644 --- a/src/ge/ge_local_engine/engine/host_cpu_engine.cc +++ b/src/ge/ge_local_engine/engine/host_cpu_engine.cc @@ -32,26 +32,6 @@ namespace { case (DTYPE): { \ GeTensorPtr ge_tensor = nullptr; \ if 
(need_create_flag) { \ - int64_t num_size = out_desc.GetShape().IsScalar() ? 1 : out_desc.GetShape().GetShapeSize(); \ - if (out_desc.GetShape().IsUnknownShape()) { \ - std::vector> range; \ - if (out_desc.GetShapeRange(range) != GRAPH_SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Get shape range failed, node:%s", op_desc->GetName().c_str()); \ - return INTERNAL_ERROR; \ - } \ - int64_t max_range_size = 1; \ - for (const auto &item : range) { \ - FMK_INT64_MULCHECK(max_range_size, item.second); \ - max_range_size *= item.second; \ - } \ - num_size = max_range_size; \ - } \ - if (num_size < 0) { \ - GELOGE(INTERNAL_ERROR, "node:%s, get size for output %zu failed, num=%lld", op_desc->GetName().c_str(), i, \ - num_size); \ - return INTERNAL_ERROR; \ - } \ - auto data_num = static_cast(num_size); \ GELOGI("node:%s allocate output %zu start, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE)); \ std::unique_ptr buf(new (std::nothrow) TYPE[data_num]()); \ if (buf == nullptr) { \ @@ -93,6 +73,29 @@ const char *kEnvKeyOppPath = "ASCEND_OPP_PATH"; const char *kHostCpuLibRelativePath = "/op_impl/built-in/host_cpu"; } // namespace +Status GetDataNumber(const GeTensorDesc &out_desc, uint64_t &data_num) { + int64_t num_size = out_desc.GetShape().IsScalar() ? 
1 : out_desc.GetShape().GetShapeSize(); + if (out_desc.GetShape().IsUnknownShape()) { + std::vector> range; + if (out_desc.GetShapeRange(range) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Get shape range failed."); + return INTERNAL_ERROR; + } + int64_t max_range_size = 1; + for (const auto &item : range) { + FMK_INT64_MULCHECK(max_range_size, item.second); + max_range_size *= item.second; + } + num_size = max_range_size; + } + if (num_size < 0) { + GELOGE(INTERNAL_ERROR, "Get negative size, num_size=%lld.", num_size); + return INTERNAL_ERROR; + } + data_num = static_cast(num_size); + return SUCCESS; +} + void HostCpuEngine::CloseSo() { for (auto handle : lib_handles_) { if (dlclose(handle) != 0) { @@ -169,13 +172,20 @@ Status HostCpuEngine::PrepareInputs(const ge::ConstOpDescPtr &op_desc, const vec Status HostCpuEngine::PrepareOutputs(const ge::ConstOpDescPtr &op_desc, vector &outputs, map &named_outputs) { if (!outputs.empty() && (outputs.size() != op_desc->GetOutputsSize())) { - GELOGW("size of ouputs not match, size of outputs = %zu, exactly output_num=%zu.", outputs.size(), + GELOGW("size of outputs not match, size of outputs = %zu, exactly output_num=%zu.", outputs.size(), op_desc->GetOutputsSize()); outputs.clear(); } bool need_create_flag = (outputs.size() != op_desc->GetOutputsSize()); for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) { const auto &out_desc = op_desc->GetOutputDesc(i); + uint64_t data_num = 0; + if (need_create_flag) { + if (GetDataNumber(out_desc, data_num) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "node:%s, get size for output %zu failed", op_desc->GetName().c_str(), i); + return INTERNAL_ERROR; + } + } switch (out_desc.GetDataType()) { CREATE_OUTPUT_CASE(DT_BOOL, bool) CREATE_OUTPUT_CASE(DT_INT8, int8_t) diff --git a/src/ge/ge_local_engine/module.mk b/src/ge/ge_local_engine/module.mk index 3307f780..a0247da7 100644 --- a/src/ge/ge_local_engine/module.mk +++ b/src/ge/ge_local_engine/module.mk @@ -8,6 +8,12 @@ local_lib_src_files := 
engine/ge_local_engine.cc \ ops_kernel_store/op/ge_deleted_op.cc \ ops_kernel_store/op/no_op.cc \ +ops_kernel_builder_src_files := ops_kernel_store/ge_local_ops_kernel_builder.cc \ + ops_kernel_store/op/op_factory.cc \ + ops_kernel_store/op/op.cc \ + ops_kernel_store/op/ge_deleted_op.cc \ + ops_kernel_store/op/no_op.cc \ + local_lib_inc_path := proto/task.proto \ ${LOCAL_PATH} \ ${TOPDIR}inc \ @@ -17,6 +23,7 @@ local_lib_inc_path := proto/task.proto \ ${TOPDIR}third_party/protobuf/include \ ${TOPDIR}inc/framework \ $(TOPDIR)framework/domi \ + $(TOPDIR)graphengine/ge \ #compiler for host include $(CLEAR_VARS) @@ -57,3 +64,84 @@ LOCAL_SRC_FILES := $(local_lib_src_files) LOCAL_C_INCLUDES := $(local_lib_inc_path) include ${BUILD_HOST_SHARED_LIBRARY} + +#compiler for libge_local_opskernel_builder.so +include $(CLEAR_VARS) +LOCAL_MODULE := libge_local_opskernel_builder +LOCAL_CFLAGS += -Werror +LOCAL_CFLAGS += -std=c++11 +LOCAL_LDFLAGS := + +LOCAL_STATIC_LIBRARIES := +LOCAL_SHARED_LIBRARIES := libprotobuf \ + libc_sec \ + libslog \ + libregister \ + libgraph + +LOCAL_SRC_FILES := $(ops_kernel_builder_src_files) + +LOCAL_C_INCLUDES := $(local_lib_inc_path) + +include ${BUILD_HOST_SHARED_LIBRARY} + + +#compiler for libge_local_opskernel_builder.so in atc +include $(CLEAR_VARS) +LOCAL_MODULE := atclib/libge_local_opskernel_builder +LOCAL_CFLAGS += -Werror +LOCAL_CFLAGS += -std=c++11 +LOCAL_LDFLAGS := + +LOCAL_STATIC_LIBRARIES := +LOCAL_SHARED_LIBRARIES := libprotobuf \ + libc_sec \ + libslog \ + libregister \ + libgraph + +LOCAL_SRC_FILES := $(ops_kernel_builder_src_files) + +LOCAL_C_INCLUDES := $(local_lib_inc_path) + +include ${BUILD_HOST_SHARED_LIBRARY} + +#compiler for libge_local_opskernel_builder.a +include $(CLEAR_VARS) +LOCAL_MODULE := libge_local_opskernel_builder +LOCAL_CFLAGS += -Werror +LOCAL_CFLAGS += -std=c++11 +LOCAL_LDFLAGS := + +LOCAL_STATIC_LIBRARIES := libprotobuf \ + libregister \ + libgraph \ + +LOCAL_SHARED_LIBRARIES := libc_sec \ + libslog \ + 
+LOCAL_SRC_FILES := $(ops_kernel_builder_src_files) + +LOCAL_C_INCLUDES := $(local_lib_inc_path) + +include ${BUILD_HOST_STATIC_LIBRARY} + +#compiler for device libge_local_opskernel_builder.a +include $(CLEAR_VARS) +LOCAL_MODULE := libge_local_opskernel_builder +LOCAL_CFLAGS += -Werror +LOCAL_CFLAGS += -std=c++11 +LOCAL_LDFLAGS := + +LOCAL_STATIC_LIBRARIES := libprotobuf \ + libregister \ + libgraph \ + +LOCAL_SHARED_LIBRARIES := libc_sec \ + libslog \ + +LOCAL_SRC_FILES := $(ops_kernel_builder_src_files) + +LOCAL_C_INCLUDES := $(local_lib_inc_path) + +include ${BUILD_STATIC_LIBRARY} diff --git a/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_builder.cc b/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_builder.cc new file mode 100644 index 00000000..ef59d71b --- /dev/null +++ b/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_builder.cc @@ -0,0 +1,174 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ge_local_ops_kernel_builder.h" +#include +#include "common/ge_inner_error_codes.h" +#include "common/ge/ge_util.h" +#include "framework/common/debug/ge_log.h" +#include "graph/utils/node_utils.h" +#include "graph/utils/tensor_utils.h" +#include "graph/utils/type_utils.h" +#include "ge_local_engine/ops_kernel_store/op/op_factory.h" +#include "ge_local_engine/common/constant/constant.h" +#include "register/ops_kernel_builder_registry.h" + +namespace ge { +namespace ge_local { +REGISTER_OPS_KERNEL_BUILDER(kGeLocalOpKernelLibName, GeLocalOpsKernelBuilder); + +namespace { +const char *const kConstantOpType = "Constant"; +const char *const kConstantOpAttrName = "value"; +const char *const kDataOpType = "Data"; +} // namespace + +GeLocalOpsKernelBuilder::~GeLocalOpsKernelBuilder() { GELOGI("GeLocalOpsKernelBuilder destroyed"); } + +Status GeLocalOpsKernelBuilder::Initialize(const map &options) { return SUCCESS; } + +Status GeLocalOpsKernelBuilder::Finalize() { return SUCCESS; } + +Status GeLocalOpsKernelBuilder::CalcOpRunningParam(Node &ge_node) { + GELOGD("[%s] CalcOpRunningParam In.", ge_node.GetName().c_str()); + OpDescPtr op_desc = ge_node.GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(FAILED, "CalcOpRunningParam failed, as op desc is null"); + return FAILED; + } + + bool is_shape_unknown = false; + if (NodeUtils::GetNodeUnknownShapeStatus(ge_node, is_shape_unknown) == GRAPH_SUCCESS) { + if (is_shape_unknown) { + GELOGI("op:%s is unknown shape, does not need to calc output size.", ge_node.GetName().c_str()); + return SUCCESS; + } + } + + const string node_name = ge_node.GetName(); + const string node_type = ge_node.GetType(); + size_t output_size = op_desc->GetOutputsSize(); + GELOGD("Calc op[%s:%s] running param, output size=%zu.", node_name.c_str(), node_type.c_str(), output_size); + + for (size_t i = 0; i < output_size; ++i) { + GeTensorDesc output_tensor = op_desc->GetOutputDesc(static_cast(i)); + Format format = output_tensor.GetFormat(); + 
DataType data_type = output_tensor.GetDataType(); + + int64_t mem_size = 0; + graphStatus graph_status = TensorUtils::GetSize(output_tensor, mem_size); + // If mem size has been set, no need reset. + if ((graph_status == GRAPH_SUCCESS) && (mem_size > 0) && (data_type != DT_STRING)) { + GELOGD("Op[%s:%s] out[%zu] mem size has been set, no need calc again, format=%s, data_type=%s, mem_size=%ld.", + node_name.c_str(), node_type.c_str(), i, TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str(), mem_size); + continue; + } + + int64_t output_mem_size = 0; + GeShape output_shape = output_tensor.GetShape(); + if ((node_type == kConstantOpType) && (data_type == DT_STRING)) { + graph_status = CalcConstantStrMemSize(op_desc, output_mem_size); + } else if (node_type == kDataOpType) { + int64_t o_size = 0; + graph_status = TensorUtils::GetTensorMemorySizeInBytes(output_tensor, o_size); + output_mem_size = o_size; + } else { + graph_status = TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size); + } + + if (graph_status != GRAPH_SUCCESS) { + GELOGE(FAILED, "Calc op[%s:%s] out[%zu] mem size failed, format=%s, data_type=%s, error=%u.", node_name.c_str(), + node_type.c_str(), i, TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str(), graph_status); + return FAILED; + } + + if (output_mem_size < 0) { + GELOGE(FAILED, + "Calc op[%s:%s] out[%zu] mem size is negative(not support)," + " format=%s, data_type=%s, mem_size=%ld.", + node_name.c_str(), node_type.c_str(), i, TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str(), output_mem_size); + return FAILED; + } + GELOGI( + "Calc op[%s:%s] out[%zu] mem size is %ld," + " format=%s, data_type=%s.", + node_name.c_str(), node_type.c_str(), i, output_mem_size, TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); + + 
TensorUtils::SetSize(output_tensor, output_mem_size); + + graph_status = op_desc->UpdateOutputDesc(static_cast(i), output_tensor); + if (graph_status != GRAPH_SUCCESS) { + GELOGE(FAILED, "Update op[%s:%s] out[%zu] desc failed, format=%s, data_type=%s, error=%u.", node_name.c_str(), + node_type.c_str(), i, TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str(), graph_status); + return FAILED; + } + } + GELOGD("Calc op[%s:%s] running param success.", node_name.c_str(), node_type.c_str()); + return SUCCESS; +} + +Status GeLocalOpsKernelBuilder::CalcConstantStrMemSize(const OpDescPtr &op_desc, int64_t &mem_size) { + if (op_desc == nullptr) { + GELOGE(FAILED, "CalcConstantStrMemSize failed, as op desc is null"); + return FAILED; + } + ConstGeTensorPtr value = MakeShared(); + if (value == nullptr) { + GELOGE(FAILED, "make shared ConstGeTensor exception."); + return FAILED; + } + // Constant op attr name is "value" + if (!AttrUtils::GetTensor(op_desc, kConstantOpAttrName, value)) { + GELOGE(FAILED, "Get Constant op attr value failed"); + return FAILED; + } + mem_size = static_cast(value->GetData().size()); + return SUCCESS; +} + +Status GeLocalOpsKernelBuilder::GenerateTask(const Node &node, RunContext &context, std::vector &tasks) { + bool is_shape_unknown = false; + if (NodeUtils::GetNodeUnknownShapeStatus(node, is_shape_unknown) == GRAPH_SUCCESS) { + if (is_shape_unknown) { + GELOGI("op:%s is unknown shape, does not need to generate task", node.GetName().c_str()); + return SUCCESS; + } + } + string name = node.GetName(); + string type = node.GetType(); + GELOGD("Ge local generate task for node:%s(%s) begin, tasks.size()=%zu.", name.c_str(), type.c_str(), tasks.size()); + + auto op = OpFactory::Instance().CreateOp(node, context); + if (op == nullptr) { + GELOGE(FAILED, "CreateOp for node:%s(%s) failed.", name.c_str(), type.c_str()); + return FAILED; + } + + Status ret = op->Run(); + if (ret != SUCCESS) { + GELOGE(ret, 
"Node:%s(%s) op run failed.", name.c_str(), type.c_str()); + return ret; + } + GELOGI("Ge local generate task for node:%s(%s) end, tasks.size()=%zu.", name.c_str(), type.c_str(), tasks.size()); + return ret; +} +} // namespace ge_local +} // namespace ge diff --git a/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_builder.h b/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_builder.h new file mode 100644 index 00000000..8cb20451 --- /dev/null +++ b/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_builder.h @@ -0,0 +1,48 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_GE_LOCAL_ENGINE_OPS_KERNEL_UTILS_GE_LOCAL_OPS_KERNEL_UTILS_H_ +#define GE_GE_LOCAL_ENGINE_OPS_KERNEL_UTILS_GE_LOCAL_OPS_KERNEL_UTILS_H_ + +#include "external/ge/ge_api_error_codes.h" +#include "common/opskernel/ops_kernel_builder.h" + +namespace ge { +namespace ge_local { +class GeLocalOpsKernelBuilder : public OpsKernelBuilder { + public: + ~GeLocalOpsKernelBuilder() override; + Status Initialize(const map &options) override; + + Status Finalize() override; + + Status CalcOpRunningParam(Node &node) override; + + Status GenerateTask(const Node &node, RunContext &context, std::vector &tasks) override; + + private: + /** + * Calc memSize for constant which type is DT_STRING. 
+ * @param op_desc OpDesc information + * @param mem_size output size + * @return whether this operation success + */ + Status CalcConstantStrMemSize(const OpDescPtr &op_desc, int64_t &mem_size); +}; +} // namespace ge_local +} // namespace ge + +#endif // GE_GE_LOCAL_ENGINE_OPS_KERNEL_UTILS_GE_LOCAL_OPS_KERNEL_UTILS_H_ diff --git a/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.cc b/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.cc index adf936c0..c4c7be32 100644 --- a/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.cc +++ b/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.cc @@ -26,11 +26,6 @@ #include "op/op_factory.h" #include "proto/task.pb.h" -namespace { -const char *const kConstantOpType = "Constant"; -const char *const kConstantOpAttrName = "value"; -const char *const kDataOpType = "Data"; -} // namespace namespace ge { namespace ge_local { using domi::TaskDef; @@ -63,136 +58,8 @@ Status GeLocalOpsKernelInfoStore::Finalize() { return SUCCESS; } -Status GeLocalOpsKernelInfoStore::CalcOpRunningParam(Node &ge_node) { - OpDescPtr op_desc = ge_node.GetOpDesc(); - if (op_desc == nullptr) { - GELOGE(FAILED, "CalcOpRunningParam failed, as op desc is null"); - return FAILED; - } - - bool is_shape_unknown = false; - if (NodeUtils::GetNodeUnknownShapeStatus(ge_node, is_shape_unknown) == GRAPH_SUCCESS) { - if (is_shape_unknown) { - GELOGI("op:%s is unknown shape, does not need to calc output size.", ge_node.GetName().c_str()); - return SUCCESS; - } - } - - const string node_name = ge_node.GetName(); - const string node_type = ge_node.GetType(); - size_t output_size = op_desc->GetOutputsSize(); - GELOGD("Calc op[%s:%s] running param, output size=%zu.", node_name.c_str(), node_type.c_str(), output_size); - - for (size_t i = 0; i < output_size; ++i) { - GeTensorDesc output_tensor = op_desc->GetOutputDesc(static_cast(i)); - Format format = output_tensor.GetFormat(); - DataType data_type = 
output_tensor.GetDataType(); - - int64_t mem_size = 0; - graphStatus graph_status = TensorUtils::GetSize(output_tensor, mem_size); - // If mem size has been set, no need reset. - if ((graph_status == GRAPH_SUCCESS) && (mem_size > 0) && (data_type != DT_STRING)) { - GELOGD("Op[%s:%s] out[%zu] mem size has been set, no need calc again, format=%s, data_type=%s, mem_size=%ld.", - node_name.c_str(), node_type.c_str(), i, TypeUtils::FormatToSerialString(format).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str(), mem_size); - continue; - } - - int64_t output_mem_size = 0; - GeShape output_shape = output_tensor.GetShape(); - if ((node_type == kConstantOpType) && (data_type == DT_STRING)) { - graph_status = CalcConstantStrMemSize(op_desc, output_mem_size); - } else if (node_type == kDataOpType) { - int64_t output_size = 0; - graph_status = TensorUtils::GetTensorMemorySizeInBytes(output_tensor, output_size); - output_mem_size = output_size; - } else { - graph_status = TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size); - } - - if (graph_status != GRAPH_SUCCESS) { - GELOGE(FAILED, "Calc op[%s:%s] out[%zu] mem size failed, format=%s, data_type=%s, error=%u.", node_name.c_str(), - node_type.c_str(), i, TypeUtils::FormatToSerialString(format).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str(), graph_status); - return FAILED; - } - - if (output_mem_size < 0) { - GELOGE(FAILED, - "Calc op[%s:%s] out[%zu] mem size is negative(not support)," - " format=%s, data_type=%s, mem_size=%ld.", - node_name.c_str(), node_type.c_str(), i, TypeUtils::FormatToSerialString(format).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str(), output_mem_size); - return FAILED; - } - GELOGI( - "Calc op[%s:%s] out[%zu] mem size is %ld," - " format=%s, data_type=%s.", - node_name.c_str(), node_type.c_str(), i, output_mem_size, TypeUtils::FormatToSerialString(format).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str()); - - 
TensorUtils::SetSize(output_tensor, output_mem_size); - - graph_status = op_desc->UpdateOutputDesc(static_cast(i), output_tensor); - if (graph_status != GRAPH_SUCCESS) { - GELOGE(FAILED, "Update op[%s:%s] out[%zu] desc failed, format=%s, data_type=%s, error=%u.", node_name.c_str(), - node_type.c_str(), i, TypeUtils::FormatToSerialString(format).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str(), graph_status); - return FAILED; - } - } - GELOGD("Calc op[%s:%s] running param success.", node_name.c_str(), node_type.c_str()); - return SUCCESS; -} - -Status GeLocalOpsKernelInfoStore::CalcConstantStrMemSize(const OpDescPtr &op_desc, int64_t &mem_size) { - if (op_desc == nullptr) { - GELOGE(FAILED, "CalcConstantStrMemSize failed, as op desc is null"); - return FAILED; - } - ConstGeTensorPtr value = MakeShared(); - if (value == nullptr) { - GELOGE(FAILED, "make shared ConstGeTensor exception."); - return FAILED; - } - // Constant op attr name is "value" - if (!AttrUtils::GetTensor(op_desc, kConstantOpAttrName, value)) { - GELOGE(FAILED, "Get Constant op attr value failed"); - return FAILED; - } - mem_size = static_cast(value->GetData().size()); - return GRAPH_SUCCESS; -} - void GeLocalOpsKernelInfoStore::GetAllOpsKernelInfo(map &infos) const { infos = op_info_map_; } -Status GeLocalOpsKernelInfoStore::GenerateTask(const Node &node, RunContext &context, vector &tasks) { - bool is_shape_unknown = false; - if (NodeUtils::GetNodeUnknownShapeStatus(node, is_shape_unknown) == GRAPH_SUCCESS) { - if (is_shape_unknown) { - GELOGI("op:%s is unknown shape, does not need to generate task", node.GetName().c_str()); - return SUCCESS; - } - } - string name = node.GetName(); - string type = node.GetType(); - GELOGD("Ge local generate task for node:%s(%s) begin, tasks.size()=%zu.", name.c_str(), type.c_str(), tasks.size()); - - auto op = OpFactory::Instance().CreateOp(node, context); - if (op == nullptr) { - GELOGE(FAILED, "CreateOp for node:%s(%s) failed.", name.c_str(), 
type.c_str()); - return FAILED; - } - - Status ret = op->Run(); - if (ret != SUCCESS) { - GELOGE(ret, "Node:%s(%s) op run failed.", name.c_str(), type.c_str()); - return ret; - } - GELOGI("Ge local generate task for node:%s(%s) end, tasks.size()=%zu.", name.c_str(), type.c_str(), tasks.size()); - return ret; -} - bool GeLocalOpsKernelInfoStore::CheckSupported(const OpDescPtr &op_desc, std::string &) const { if (op_desc == nullptr) { return false; diff --git a/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.h b/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.h index ce123751..711052b7 100644 --- a/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.h +++ b/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.h @@ -58,22 +58,6 @@ class GeLocalOpsKernelInfoStore : public OpsKernelInfoStore { */ void GetAllOpsKernelInfo(std::map &infos) const override; - /** - * Calc the running size of Operator, - * then GE will alloc the mem size from runtime - * @param ge_node Node information - * @return status whether this operation success - */ - Status CalcOpRunningParam(ge::Node &ge_node) override; - - /** - * call the runtime's interface to generate the task - * @param node Node information - * @param context run context info - * @return status whether this operation success - */ - Status GenerateTask(const ge::Node &ge_node, ge::RunContext &context, std::vector &tasks) override; - /** * Create session * @param session_options Session Options @@ -101,14 +85,6 @@ class GeLocalOpsKernelInfoStore : public OpsKernelInfoStore { GeLocalOpsKernelInfoStore &operator=(GeLocalOpsKernelInfoStore &&ops_kernel_store) = delete; private: - /** - * Calc memSize for constant which type is DT_STRING. 
- * @param op_desc OpDesc information - * @param mem_size output size - * @return whether this operation success - */ - Status CalcConstantStrMemSize(const OpDescPtr &op_desc, int64_t &mem_size); - // store op name and OpInfo key-value pair std::map op_info_map_; }; diff --git a/src/ge/ge_local_engine/ops_kernel_store/op/op.h b/src/ge/ge_local_engine/ops_kernel_store/op/op.h index 1b184dad..cc73c01a 100644 --- a/src/ge/ge_local_engine/ops_kernel_store/op/op.h +++ b/src/ge/ge_local_engine/ops_kernel_store/op/op.h @@ -21,10 +21,10 @@ #include #include #include "common/ge_inner_error_codes.h" -#include "common/opskernel/ops_kernel_info_types.h" #include "graph/node.h" namespace ge { +struct RunContext; namespace ge_local { /** * The base class for all op. diff --git a/src/ge/ge_runner.mk b/src/ge/ge_runner.mk index 956bab0b..6c448a46 100644 --- a/src/ge/ge_runner.mk +++ b/src/ge/ge_runner.mk @@ -89,7 +89,9 @@ LIBGE_LOCAL_SRC_FILES := \ graph/manager/graph_mem_allocator.cc \ graph/manager/graph_caching_allocator.cc \ graph/manager/graph_var_manager.cc \ + graph/manager/host_mem_manager.cc \ graph/manager/rdma_pool_allocator.cc \ + graph/manager/memory_api.cc \ graph/manager/model_manager/event_manager.cc \ graph/manager/trans_var_data_utils.cc \ graph/manager/util/debug.cc \ @@ -109,6 +111,7 @@ LIBGE_LOCAL_SRC_FILES := \ graph/passes/mark_same_addr_pass.cc \ graph/passes/mark_graph_unknown_status_pass.cc \ graph/partition/dynamic_shape_partition.cc \ + graph/partition/stage_partition.cc \ graph/passes/base_pass.cc \ graph/passes/bitcast_pass.cc \ graph/passes/cast_remove_pass.cc \ @@ -179,6 +182,7 @@ LIBGE_LOCAL_SRC_FILES := \ graph/passes/multi_batch_pass.cc \ graph/passes/multi_batch_clone_pass.cc \ graph/passes/subexpression_migration_pass.cc \ + graph/passes/subgraph_const_migration_pass.cc \ graph/passes/unused_args_clean_pass.cc \ graph/passes/net_output_pass.cc \ graph/passes/next_iteration_pass.cc \ @@ -240,6 +244,7 @@ LIBGE_LOCAL_SRC_FILES := \ 
model/ge_root_model.cc \ omm/csa_interact.cc \ opskernel_manager/ops_kernel_manager.cc \ + opskernel_manager/ops_kernel_builder_manager.cc \ session/inner_session.cc \ session/session_manager.cc \ single_op/single_op.cc \ @@ -313,6 +318,7 @@ RUNNER_LOCAL_C_INCLUDES := \ $(TOPDIR)libc_sec/include \ $(TOPDIR)ops/built-in/op_proto/inc \ $(TOPDIR)framework/domi/analyzer \ + $(TOPDIR)graphengine/ge/analyzer \ $(TOPDIR)toolchain/ide/ide-daemon/external \ proto/fwk_adapter.proto \ proto/ge_ir.proto \ @@ -369,7 +375,6 @@ LOCAL_SHARED_LIBRARIES := \ libmsprof \ liberror_manager \ - LOCAL_LDFLAGS := -lrt -ldl LOCAL_SHARED_LIBRARIES += \ @@ -396,7 +401,6 @@ LOCAL_C_INCLUDES := $(RUNNER_LOCAL_C_INCLUDES) LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_api.cc \ ../../out/ge/lib64/stub/ge_prof.cc \ - LOCAL_SHARED_LIBRARIES := LOCAL_LDFLAGS := -lrt -ldl diff --git a/src/ge/generator/ge_generator.cc b/src/ge/generator/ge_generator.cc index bef93333..db52ce59 100644 --- a/src/ge/generator/ge_generator.cc +++ b/src/ge/generator/ge_generator.cc @@ -222,7 +222,7 @@ static void GetOpsProtoPath(string &opsproto_path) { class GeGenerator::Impl { public: - Impl(OmgContext &omg_context) : omg_context_(omg_context), graph_manager_(omg_context) {} + Impl(OmgContext &omg_context) : omg_context_(omg_context) {} ~Impl() = default; Status BuildModel(const Graph &graph, const vector &inputs, GeRootModelPtr &ge_models); @@ -524,9 +524,19 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr return SUCCESS; } -Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, - const string &model_file_name, OpEngineType engine_type, ModelBufferData &model_buff, - bool is_offline) { +namespace { +bool IsNeedConnectInputOpForSingleOp(GeTensorDesc &tensor_desc) { + bool is_need = true; + // format and dtype is all reserved, stand for Optional input. 
When singleop scene + if (tensor_desc.GetFormat() == FORMAT_RESERVED && tensor_desc.GetDataType() == DT_UNDEFINED) { + is_need = false; + } + return is_need; +} +} // namespace + +Status GeGenerator::CheckForSingleOp(OpDescPtr &op_desc, const vector &inputs, + const vector &outputs) { GE_CHECK_NOTNULL_EXEC(op_desc, return PARAM_INVALID); if (!inputs.empty() && (inputs.size() != op_desc->GetAllInputsSize())) { GELOGE(PARAM_INVALID, "Tensor size: %zu, Inputs size: %zu", inputs.size(), op_desc->GetAllInputsSize()); @@ -536,7 +546,16 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in GELOGE(PARAM_INVALID, "Tensor size: %zu, Outputs size: %zu", outputs.size(), op_desc->GetOutputsSize()); return PARAM_INVALID; } + return SUCCESS; +} +Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, + const string &model_file_name, OpEngineType engine_type, ModelBufferData &model_buff, + bool is_offline) { + if (CheckForSingleOp(op_desc, inputs, outputs) != SUCCESS) { + GELOGE(PARAM_INVALID, "input param is invalid when build single op!"); + return PARAM_INVALID; + } OmgContext &omg_context = (impl_ == nullptr) ? 
domi::GetContext() : impl_->omg_context_; omg_context.is_dynamic_input = ContainsDynamicInpus(*op_desc); @@ -571,12 +590,18 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in if (inputs.empty()) { for (const auto &input_desc : op_desc->GetAllInputsDescPtr()) { GE_CHECK_NOTNULL_EXEC(input_desc, return INTERNAL_ERROR); + if (!IsNeedConnectInputOpForSingleOp(*input_desc)) { + continue; + } GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, *input_desc, arg_index, false)); arg_index++; } } else { for (const auto &in_desc : inputs) { GeTensorDesc input_desc = in_desc.GetTensorDesc(); + if (!IsNeedConnectInputOpForSingleOp(input_desc)) { + continue; + } GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, input_desc, arg_index, true)); arg_index++; } @@ -679,7 +704,7 @@ Status GeGenerator::Impl::BuildModel(const Graph &graph, const vector static std::atomic atomic_graph_id(0); auto graph_id = atomic_graph_id.fetch_add(1); const std::map options; - Status ret = graph_manager_.AddGraph(graph_id, graph, options); + Status ret = graph_manager_.AddGraph(graph_id, graph, options, omg_context_); if (ret != SUCCESS) { GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "GraphManager add graph fail, graph id: %u", graph_id); (void)graph_manager_.Finalize(); @@ -712,7 +737,7 @@ Status GeGenerator::Impl::GenerateInfershapeGraph(const Graph &graph) { static std::atomic atomic_graph_id(0); auto graph_id = atomic_graph_id.fetch_add(1); const std::map options; - Status ret = graph_manager_.AddGraph(graph_id, graph, options); + Status ret = graph_manager_.AddGraph(graph_id, graph, options, omg_context_); if (ret != SUCCESS) { GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "GraphManager add graph failed, graph id: %u", graph_id); (void)graph_manager_.Finalize(); diff --git a/src/ge/generator/generator_api.cc b/src/ge/generator/generator_api.cc index 3f92f1a2..0f003e90 100644 --- a/src/ge/generator/generator_api.cc +++ 
b/src/ge/generator/generator_api.cc @@ -116,7 +116,7 @@ Status_t OpTaskGernerator(const char *op_type, const OpTensor_t *in_tensor, int CHECK_PARAM_NOT_NULL(om_file); const std::string om_file_name(om_file); - std::string op_name = std::string(op_type) + "_" + std::to_string(ge::GetCurrentTimestap()); + std::string op_name = std::string(op_type) + "_" + std::to_string(ge::GetCurrentTimestamp()); ge::OpDescPtr op_desc = ge::MakeShared(op_name, op_type); if (op_desc == nullptr) { return ge::FAILED; diff --git a/src/ge/graph/build/graph_builder.cc b/src/ge/graph/build/graph_builder.cc index 27d0b13f..480f20c4 100644 --- a/src/ge/graph/build/graph_builder.cc +++ b/src/ge/graph/build/graph_builder.cc @@ -17,25 +17,77 @@ #include "graph/build/graph_builder.h" #include "common/ge/ge_util.h" #include "common/helper/model_helper.h" -#include "common/opskernel/ops_kernel_info_types.h" #include "graph/build/logical_stream_allocator.h" #include "graph/build/run_context.h" #include "graph/build/stream_graph_optimizer.h" +#include "graph/common/ge_call_wrapper.h" +#include "graph/ge_context.h" #include "graph/manager/graph_var_manager.h" #include "graph/passes/mark_same_addr_pass.h" #include "graph/utils/node_utils.h" #include "graph/utils/type_utils.h" -#include "graph/common/ge_call_wrapper.h" #include "init/gelib.h" #include "model/ge_model.h" #include "graph/ge_context.h" +#include "opskernel_manager/ops_kernel_builder_manager.h" using domi::BuildMode; namespace { const int32_t kInvalidPerfLevel = -1; +enum NodeType { kSubgraphData, kSubgraphNode, kOthers }; } // namespace namespace ge { +NodeType TransferNodeType(const NodePtr &node) { + const std::string type = node->GetType(); + if (type == ge::DATA) { + if (node->GetOwnerComputeGraph()->GetParentNode() == nullptr) { + GELOGD("access src data node:%s", node->GetName().c_str()); + return kOthers; + } + GELOGD("access subgraph input node:%s", node->GetName().c_str()); + return kSubgraphData; + } else if (type == 
PARTITIONEDCALL) { + GELOGD("access subgraph node:%s", node->GetName().c_str()); + return kSubgraphNode; + } + GELOGD("access other node:%s", node->GetName().c_str()); + return kOthers; +} + +Status HandleSubgraphNode(NodePtr &src_node, OutDataAnchorPtr &src_out_anchor) { + auto subgraph = NodeUtils::GetSubgraph(*src_node, 0); + GE_CHECK_NOTNULL(subgraph); + const NodePtr &net_output_node = subgraph->FindFirstNodeMatchType(NETOUTPUT); + GE_CHECK_NOTNULL(net_output_node); + const InDataAnchorPtr &in_data_anchor = net_output_node->GetInDataAnchor(src_out_anchor->GetIdx()); + GE_CHECK_NOTNULL(in_data_anchor); + const OutDataAnchorPtr &peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(peer_out_anchor); + + src_node = peer_out_anchor->GetOwnerNode(); + src_out_anchor = peer_out_anchor; + return SUCCESS; +} + +Status HandleSubgraphDataNode(NodePtr &src_node, OutDataAnchorPtr &src_out_anchor) { + uint32_t index = 0; + if (!AttrUtils::GetInt(src_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, index)) { + GELOGE(FAILED, "Get attr ATTR_NAME_PARENT_NODE_INDEX failed, node:%s.", src_node->GetName().c_str()); + return FAILED; + } + const NodePtr &parent_node = src_node->GetOwnerComputeGraph()->GetParentNode(); + GE_CHECK_NOTNULL(parent_node); + const InDataAnchorPtr &in_data_anchor = parent_node->GetInDataAnchor(index); + GE_CHECK_NOTNULL(in_data_anchor); + const OutDataAnchorPtr &peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(peer_out_anchor); + + src_node = peer_out_anchor->GetOwnerNode(); + src_out_anchor = peer_out_anchor; + return SUCCESS; +} + GraphBuilder::GraphBuilder() : build_mode_(BuildMode::GEN_TASK_WITH_FUSION), hcom_parallel_(false) {} void GraphBuilder::SetOptions(const ge::GraphManagerOptions &options) { @@ -72,23 +124,18 @@ Status GraphBuilder::CalcOpParam(const ge::ComputeGraphPtr &graph) { } } - OpsKernelInfoStorePtr kernel_info = instance_ptr->OpsKernelManagerObj().GetOpsKernelInfoStore(kernel_lib_name); - if 
(kernel_info != nullptr) { - auto ret = SetInputSize(node_ptr); - if (ret != SUCCESS) { - GELOGE(ret, "Set node inputDesc size failed, node name is %s", node_ptr->GetName().c_str()); - return ret; - } - ret = kernel_info->CalcOpRunningParam(*node_ptr); - if (ret != SUCCESS) { - GELOGE(ret, "Calculate op running param failed, node name is %s", node_ptr->GetName().c_str()); - return ret; - } - GE_CHK_STATUS_RET(AddOutputMemTypeForNode(node_ptr)); - } else { - GELOGE(GE_GRAPH_PARAM_NULLPTR, "Get op %s ops kernel info store failed", node_ptr->GetName().c_str()); - return INTERNAL_ERROR; + auto ret = SetInputSize(node_ptr); + if (ret != SUCCESS) { + GELOGE(ret, "Set node inputDesc size failed, node name is %s", node_ptr->GetName().c_str()); + return ret; + } + + ret = OpsKernelBuilderManager::Instance().CalcOpRunningParam(*node_ptr); + if (ret != SUCCESS) { + GELOGE(ret, "Calculate op running param failed, node name is %s", node_ptr->GetName().c_str()); + return ret; } + GE_CHK_STATUS_RET(AddOutputMemTypeForNode(node_ptr)); } auto parent_node = graph->GetParentNode(); @@ -321,6 +368,11 @@ Status GraphBuilder::GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr GELOGE(INTERNAL_ERROR, "Get memory size fail."); return INTERNAL_ERROR; } + int64_t p2p_memory_size = 0; + if (!AttrUtils::GetInt(model_ptr, ATTR_MODEL_P2P_MEMORY_SIZE, p2p_memory_size)) { + GELOGE(INTERNAL_ERROR, "Get p2p memory size fail."); + return INTERNAL_ERROR; + } int64_t weight_size = 0; if (!AttrUtils::GetInt(model_ptr, ATTR_MODEL_WEIGHT_SIZE, weight_size)) { GELOGE(INTERNAL_ERROR, "Get weight memory size fail."); @@ -331,11 +383,21 @@ Status GraphBuilder::GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr auto *get_mem_base = reinterpret_cast(reinterpret_cast(var_manager->GetVarMemMaxSize())); uint8_t *get_weight_mem_base = get_mem_base; if (weight_size > 0) { - get_weight_mem_base = get_mem_base + memory_size; + get_weight_mem_base = get_mem_base + memory_size + p2p_memory_size; } - + 
std::map mem_type_to_data_mem_base; + mem_type_to_data_mem_base[RT_MEMORY_HBM] = get_mem_base; + if (p2p_memory_size == 0) { + mem_type_to_data_mem_base[RT_MEMORY_P2P_DDR] = nullptr; + } else { + mem_type_to_data_mem_base[RT_MEMORY_P2P_DDR] = get_mem_base + memory_size; + } + std::map mem_type_to_data_mem_size; + mem_type_to_data_mem_size[RT_MEMORY_HBM] = memory_size; + mem_type_to_data_mem_size[RT_MEMORY_P2P_DDR] = p2p_memory_size; RunContextUtil run_context; - Status ret = run_context.InitMemInfo(get_mem_base, memory_size, get_weight_mem_base, weight_size); + Status ret = run_context.InitMemInfo(get_mem_base, memory_size, mem_type_to_data_mem_base, mem_type_to_data_mem_size, + get_weight_mem_base, weight_size); if (ret != SUCCESS) { GELOGE(ret, "task_generator init mem info fail."); return ret; @@ -500,22 +562,50 @@ Status GraphBuilder::SecondPartition(ge::ComputeGraphPtr &comp_graph, vectorGetOpDesc(), ATTR_INPUT_MEMORY_TYPE, mem_type)) { - GELOGD("[%s] has attr input_memory_type %ld", node->GetName().c_str(), mem_type); - for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { - const auto &peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); - GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); - const auto &src_node = peer_out_anchor->GetOwnerNode(); - const auto &src_op = src_node->GetOpDesc(); - GE_IF_BOOL_EXEC(src_op == nullptr, continue); - if (!AttrUtils::SetInt(src_op, ATTR_OUTPUT_MEMORY_TYPE, mem_type)) { - GELOGE(INTERNAL_ERROR, "Set out_memory_type attr failed."); + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + uint32_t mem_type; + if (!AttrUtils::GetInt(op_desc, ATTR_INPUT_MEMORY_TYPE, mem_type)) { + return SUCCESS; + } + GELOGD("[%s] has attr input_memory_type %ld", op_desc->GetName().c_str(), mem_type); + for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { + const auto &peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); + bool valid_flag = 
false; + auto src_node = peer_out_anchor->GetOwnerNode(); + auto src_out_anchor = peer_out_anchor; + while (true) { + const auto &src_desc = src_node->GetOpDesc(); + GE_IF_BOOL_EXEC(src_desc == nullptr, continue); + GELOGD("[%s:%u] set attr output_memory_type %ld", src_desc->GetName().c_str(), src_out_anchor->GetIdx(), + mem_type); + if (!AttrUtils::SetInt(src_desc->MutableOutputDesc(src_out_anchor->GetIdx()), ATTR_OUTPUT_MEMORY_TYPE, + mem_type)) { + GELOGE(INTERNAL_ERROR, "Set out_memory_type attr for [%s:%d] failed.", src_desc->GetName().c_str(), + src_out_anchor->GetIdx()); return INTERNAL_ERROR; } - return SUCCESS; + switch (TransferNodeType(src_node)) { + case kSubgraphNode: + GE_CHK_STATUS_RET(HandleSubgraphNode(src_node, src_out_anchor), "Handle subgraph node %s failed", + src_node->GetName().c_str()); + break; + case kSubgraphData: + GE_CHK_STATUS_RET(HandleSubgraphDataNode(src_node, src_out_anchor), "Handle Data node %s in subgraph failed", + src_node->GetName().c_str()); + break; + case kOthers: + default: + valid_flag = true; + break; + } + if (valid_flag) { + break; + } } } + return SUCCESS; } } // namespace ge diff --git a/src/ge/graph/build/memory/block_mem_assigner.cc b/src/ge/graph/build/memory/block_mem_assigner.cc index 773eac6a..19138b90 100644 --- a/src/ge/graph/build/memory/block_mem_assigner.cc +++ b/src/ge/graph/build/memory/block_mem_assigner.cc @@ -401,12 +401,14 @@ string MemoryBlock::String() { for (const auto &symbol : SymbolList()) { ss << "__symbol: " << symbol << " "; } + ss << "memory_type: " << memory_type_ << " "; return ss.str(); } BlockMemAssigner::BlockMemAssigner(ComputeGraphPtr compute_graph, const map &anchor_to_symbol, const map> &symbol_to_anchors) : mem_offset_(0), + p2p_mem_offset_(0), compute_graph_(std::move(compute_graph)), symbol_to_anchors_(symbol_to_anchors), anchor_to_symbol_(anchor_to_symbol), @@ -508,6 +510,7 @@ bool IsDirectOutputNode(const NodePtr &node, int idx) { void AddReusableBlockCount(const MemoryBlock 
&mem_block, map &reusable_block_counts) { string key = std::to_string(mem_block.Size()); key += "_" + std::to_string(mem_block.stream_id_); + key += "_" + std::to_string(mem_block.memory_type_); auto it = reusable_block_counts.find(key); if (it != reusable_block_counts.end()) { it->second++; @@ -519,6 +522,7 @@ void AddReusableBlockCount(const MemoryBlock &mem_block, map & void ReduceReusableBlockCount(const MemoryBlock &mem_block, map &reusable_block_counts) { string key = std::to_string(mem_block.Size()); key += "_" + std::to_string(mem_block.stream_id_); + key += "_" + std::to_string(mem_block.memory_type_); auto it = reusable_block_counts.find(key); if (it != reusable_block_counts.end()) { if (it->second > 0) { @@ -535,6 +539,7 @@ bool CanReuseBySize(const map &reusable_block_counts, const Me } else { string key = std::to_string(reusable_block.Size()); key += "_" + std::to_string(reusable_block.stream_id_); + key += "_" + std::to_string(reusable_block.memory_type_); auto it = reusable_block_counts.find(key); GE_IF_BOOL_EXEC( (it != reusable_block_counts.end() && (it->second > kReuseMaxCount)) && (reusable_block.Size() > block_size), @@ -545,7 +550,8 @@ bool CanReuseBySize(const map &reusable_block_counts, const Me } bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t out_index, std::string &peer_name, - uint32_t &peer_input_index, bool &no_need_assign_memory) { + uint32_t &peer_input_index, bool &no_need_assign_memory, + bool &reset_zero_copy_flag) { if (n == nullptr || n->GetAllOutDataAnchors().size() <= 0) { return false; } @@ -571,6 +577,13 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou return false;); // If GetBool fail, is_input_continuous is false. 
+ bool is_input_continuous_no_padding = false; + (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, + is_input_continuous_no_padding); + if (is_input_continuous_no_padding) { + reset_zero_copy_flag = true; + return false; + } (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous); GE_IF_BOOL_EXEC(is_input_continuous && CheckIsZeroMemNodeType(peer_node->GetType()), @@ -613,6 +626,18 @@ void BlockMemAssigner::InitReuseFlag() { std::string symbol = pair.first; bool pre_reuse_flag = true; bool post_reuse_flag = true; + // default memory type + int64_t mem_type = RT_MEMORY_HBM; + GetSymbolMemType(pair.second, mem_type); + GELOGD("The memory type of symbol[%s] is [%ld]].", symbol.c_str(), mem_type); + if (mem_type == RT_MEMORY_P2P_DDR) { + UpdateOpTensorMemType(pair.second, mem_type); + } + // Only the memory with special requirements is processed. The HBM uses the default processing mode. + if (mem_type == RT_MEMORY_P2P_DDR) { + symbol_to_mem_type_[symbol] = mem_type; + } + for (const auto &node_index_io : pair.second) { if (node_index_io.io_type_ == kIn) { continue; @@ -728,6 +753,66 @@ void BlockMemAssigner::PrintSymbolMap() { } } +void BlockMemAssigner::GetSymbolMemType(std::list node_index_io_list, int64_t &memory_type) { + memory_type = RT_MEMORY_HBM; + vector memory_types; + for (auto &node_index_io : node_index_io_list) { + auto op_desc = node_index_io.node_->GetOpDesc(); + if (op_desc == nullptr) { + GELOGW("Node[%s] op desc is null.", node_index_io.node_->GetName().c_str()); + return; + } + + if (node_index_io.io_type_ == kIn) { + vector input_memory_types; + (void)ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_MEM_TYPE_LIST, input_memory_types); + if (!input_memory_types.empty() && node_index_io.index_ < input_memory_types.size()) { + int64_t input_memory_type = input_memory_types[node_index_io.index_]; + GELOGD("Node[%s]: the memory type of input index [%u] is [%ld]].", 
op_desc->GetName().c_str(), + node_index_io.index_, input_memory_type); + memory_types.emplace_back(input_memory_type); + } + } + if (node_index_io.io_type_ == kOut) { + vector output_memory_types; + (void)ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, output_memory_types); + if (!output_memory_types.empty() && node_index_io.index_ < output_memory_types.size()) { + int64_t output_memory_type = output_memory_types[node_index_io.index_]; + GELOGD("Node[%s]: the memory type of output index [%u] is [%ld]].", op_desc->GetName().c_str(), + node_index_io.index_, output_memory_type); + memory_types.emplace_back(output_memory_type); + } + } + } + + // memory priority + for (auto node_memory_type : memory_types) { + if (node_memory_type > memory_type) { + memory_type = node_memory_type; + } + } +} + +void BlockMemAssigner::UpdateOpTensorMemType(std::list node_index_io_list, int64_t memory_type) { + for (auto &node_index_io : node_index_io_list) { + auto op_desc = node_index_io.node_->GetOpDesc(); + if (op_desc == nullptr) { + GELOGW("Node[%s] op desc is null.", node_index_io.node_->GetName().c_str()); + return; + } + + if (node_index_io.io_type_ == kIn) { + auto input_desc = op_desc->MutableInputDesc(node_index_io.index_); + (void)AttrUtils::SetInt(input_desc, ATTR_NAME_TENSOR_MEM_TYPE, memory_type); + } + + if (node_index_io.io_type_ == kOut) { + auto output_desc = op_desc->MutableOutputDesc(node_index_io.index_); + (void)AttrUtils::SetInt(output_desc, ATTR_NAME_TENSOR_MEM_TYPE, memory_type); + } + } +} + bool BlockMemAssigner::IsContinuousOutput(const NodePtr &n) { if (n == nullptr) { GELOGE(FAILED, "Node is null."); @@ -774,9 +859,9 @@ bool BlockMemAssigner::IsZeroCopyBlock(const NodePtr &node, bool continuous) { } MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, size_t no_align_size, - MemoryType mem_type, const NodePtr &n, uint32_t out_index, + OpMemoryType mem_type, const NodePtr &n, uint32_t out_index, const vector 
&workspace_reuse_flag, const bool is_op_reuse_mem, - const bool continuous) { + const bool continuous, int64_t memory_type) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "Input parameter n is null."); auto node_op_desc = n->GetOpDesc(); GE_IF_BOOL_EXEC(node_op_desc == nullptr, return nullptr); @@ -789,8 +874,9 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, is_reuse_memory = !node_op_desc->HasAttr(kL2FusionDynamicConvergeOp) && !node_op_desc->HasAttr(kOpNoReuseMem) && reuse_mem_flag && is_op_reuse_mem && (IsPreReuse(n, out_index)); auto stream_id = node_op_desc->GetStreamId(); - if (is_reuse_memory && !continuous) { - for (auto it = reusable_blocks_[stream_id].begin(); it != reusable_blocks_[stream_id].end(); ++it) { + if (is_reuse_memory && !continuous && !reusable_blocks_[memory_type].empty()) { + for (auto it = reusable_blocks_[memory_type][stream_id].begin(); + it != reusable_blocks_[memory_type][stream_id].end(); ++it) { MemoryBlock *reusable_block = *it; if (!IsPostReuse(reusable_block)) { reusable_block->reuse_mem_ = false; @@ -810,14 +896,14 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, reusable_block->continuous_block_ = continuous; reusable_block->ref_count_++; ReduceReusableBlockCount(*reusable_block, reusable_block_counts_); - reusable_blocks_[stream_id].erase(it); + reusable_blocks_[memory_type][stream_id].erase(it); return reusable_block; } } } } - auto block = new (std::nothrow) MemoryBlock(block_size, node_op_desc->GetStreamId(), is_reuse_memory); + auto block = new (std::nothrow) MemoryBlock(block_size, node_op_desc->GetStreamId(), is_reuse_memory, memory_type); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(block == nullptr, return nullptr, "new an object failed."); // Data and netoutput need zero copy block @@ -834,6 +920,8 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, } } memory_blocks_.emplace_back(block); + // cause memory_blocks_ may 
reduce when swap after, + // create blocks_store_ to assure blocks deleted finally blocks_store_.emplace_back(block); return block; } @@ -845,11 +933,13 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return nullptr, "node_op_desc is null."); MemoryBlock *block = nullptr; int64_t total_size = 0; + int64_t memory_type = RT_MEMORY_HBM; for (uint32_t index = 0; index < static_cast(node_op_desc->GetOutputsSize()); index++) { auto output_op_desc = node_op_desc->GetOutputDescPtr(index); if (output_op_desc == nullptr) { return nullptr; } + int64_t size = 0; if (ge::TensorUtils::GetSize(*output_op_desc, size) != SUCCESS) { GELOGI("Get size failed"); @@ -863,6 +953,18 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec if (index != 0) { zero_memory_list_.emplace_back(n, kOutput, index); } + + if (index == 0) { + NodeIndexIO node_index_io(n, index, kOut); + auto iter = anchor_to_symbol_.find(node_index_io.ToString()); + if (iter != anchor_to_symbol_.end()) { + string symbol = iter->second; + if (symbol_to_mem_type_.find(symbol) != symbol_to_mem_type_.end()) { + memory_type = symbol_to_mem_type_[symbol]; + GELOGD("Continuous out memory symbol is [%s], memory type is [%ld]", symbol.c_str(), memory_type); + } + } + } } auto block_size = GetBlockSize(total_size, ranges); @@ -870,7 +972,8 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec block_size); vector workspace_reuse_flag; - block = ApplyMemory(block_size, total_size, total_size, kOutput, n, 0, workspace_reuse_flag, is_op_reuse_mem, true); + block = ApplyMemory(block_size, total_size, total_size, kOutput, n, 0, workspace_reuse_flag, is_op_reuse_mem, true, + memory_type); if (block != nullptr) { // hccl task need align header and tail block->first_continuous_block_ = true; @@ -902,17 +1005,23 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t 
index, block->ref_count_++; } else { int64_t max_size = size; + int64_t memory_type = RT_MEMORY_HBM; auto iter1 = anchor_to_symbol_.find(node_index_io.ToString()); if (iter1 != anchor_to_symbol_.end()) { auto iter2 = symbol_size_.find(iter1->second); if (iter2 != symbol_size_.end()) { max_size = iter2->second; } + auto iter3 = symbol_to_mem_type_.find(iter1->second); + if (iter3 != symbol_to_mem_type_.end()) { + memory_type = iter3->second; + } } + auto block_size = GetBlockSize(max_size, ranges); vector workspace_reuse_flag; block = ApplyMemory(block_size, size, no_align_size, kOutput, n, index, workspace_reuse_flag, is_op_reuse_mem, - continuous); + continuous, memory_type); } GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(block == nullptr, return nullptr, "Block is nullptr."); int out_count_reuse_input = block->ref_count_; @@ -1126,8 +1235,10 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector // If GetBool fail, is_atomic is false. (void)ge::AttrUtils::GetBool(op_desc, ATOMIC_ATTR_IS_ATOMIC_NODE, is_atomic); // Allocate memory for the current node and release node memory of the same size in the workspace - GE_IF_BOOL_EXEC(ge_disable_reuse_mem_env_ != "1", - ReleaseMemorys(stream_workspace_blocks_[stream_id], reusable_blocks_[stream_id])); + GE_IF_BOOL_EXEC( + ge_disable_reuse_mem_env_ != "1", + for (auto iter = stream_workspace_blocks_.begin(); iter != stream_workspace_blocks_.end(); + ++iter) { ReleaseMemorys(iter->second[stream_id], reusable_blocks_[iter->first][stream_id]); }); if (IsContinuousOutput(node)) { (void)ApplyContinuousMemory(node, ranges, is_op_reuse_mem_); return SUCCESS; @@ -1148,10 +1259,11 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector std::string peer_name; uint32_t peer_input_index = 0; bool out_node_set_continuous_input = false; + bool reset_zero_copy_flag = false; bool no_need_assign_memory = ((size == 0) || CheckIsZeroMemNodeType(node->GetType())); if (!no_need_assign_memory) { 
out_node_set_continuous_input = - IsOutNodeSetContinuousInput(node, i, peer_name, peer_input_index, no_need_assign_memory); + IsOutNodeSetContinuousInput(node, i, peer_name, peer_input_index, no_need_assign_memory, reset_zero_copy_flag); GE_IF_BOOL_EXEC(!no_need_assign_memory, no_need_assign_memory = IsAtomicOutputMemory(node, i, is_atomic, out_node_set_continuous_input);); } @@ -1165,8 +1277,11 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector if (need_change) { is_op_reuse_mem_ = false; } + MemoryBlock *mem_block = ApplyOutMemory(node, i, ranges, is_op_reuse_mem_, out_node_set_continuous_input); if (mem_block != nullptr) { + GE_IF_BOOL_EXEC(reset_zero_copy_flag, mem_block->is_zero_copy_ = false; GELOGI( + "Node[%s] output[%u] need assign memory before reassign.", op_desc->GetName().c_str(), i);); node_out_blocks_[node->GetName()].emplace_back(mem_block); if (out_node_set_continuous_input) { node_continuous_input_blocks_[peer_name][peer_input_index] = mem_block; @@ -1204,45 +1319,53 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector &ranges) { if (AssignOutputMemoryWithReuse(n, ranges) != SUCCESS) { return; } - - stream_workspace_blocks_[stream_id].clear(); + for (auto iter = stream_workspace_blocks_.begin(); iter != stream_workspace_blocks_.end(); ++iter) { + iter->second[stream_id].clear(); + } vector temp; GetNodeWorkSpaceSize(n, temp); vector workspace_bytes; - vector workspace_memory_type; - bool has_workspace_mem_type_attr = - ge::AttrUtils::GetListInt(node_op_desc, TVM_ATTR_NAME_WORKSPACE_TYPE, workspace_memory_type); + vector tvm_workspace_memory_type; + bool has_tvm_workspace_mem_type_attr = + ge::AttrUtils::GetListInt(node_op_desc, TVM_ATTR_NAME_WORKSPACE_TYPE, tvm_workspace_memory_type); vector workspace_reuse_flag; GE_IF_BOOL_EXEC(!ge::AttrUtils::GetListBool(node_op_desc, kAttrNameWorkspaceReuseFlag, workspace_reuse_flag), GELOGD("OP %s get workspace_reuse_flag attr failed", node_op_desc->GetName().c_str())); 
GELOGI("Assign memory node[%s], size [temp:%zu, memory type size:%zu]", node_op_desc->GetName().c_str(), - temp.size(), workspace_memory_type.size()); + temp.size(), tvm_workspace_memory_type.size()); - if (has_workspace_mem_type_attr && (temp.size() != workspace_memory_type.size())) { - GELOGE(INTERNAL_ERROR, "fusion: node[%s], workspace_memory size err![v_temp:%zu, workspace:%zu]", - n->GetName().c_str(), temp.size(), workspace_memory_type.size()); + if (has_tvm_workspace_mem_type_attr && (temp.size() != tvm_workspace_memory_type.size())) { + GELOGE(INTERNAL_ERROR, "fusion: node[%s], tvm workspace memory size error![v_temp:%zu, workspace:%zu]", + n->GetName().c_str(), temp.size(), tvm_workspace_memory_type.size()); return; } for (size_t i = 0; i < temp.size(); i++) { // fusion: other type's size not means malloc HBM memory bool workspace_skip_flag = false; - if (has_workspace_mem_type_attr && workspace_memory_type[i] == RT_MEMORY_L1) { + if (has_tvm_workspace_mem_type_attr && tvm_workspace_memory_type[i] == RT_MEMORY_L1) { GELOGI( "fusion: node[%s]workspace index[%d] is not hbm type, add to zero_memory_list, workspace memory type [%ld]", - node_op_desc->GetName().c_str(), i, workspace_memory_type[i]); + node_op_desc->GetName().c_str(), i, tvm_workspace_memory_type[i]); workspace_skip_flag = true; } if (temp[i] == 0 || workspace_skip_flag) { zero_memory_list_.emplace_back(n, kWorkspace, static_cast(i), false); continue; } - MemoryBlock *mem_block = ApplyMemory(GetBlockSize(static_cast(temp[i]), ranges), - static_cast(temp[i]), static_cast(temp[i]), kWorkspace, n, - static_cast(i), workspace_reuse_flag, is_op_reuse_mem_, false); + int64_t memory_type = RT_MEMORY_HBM; + if (!GetWorkSpaceMemoryType(n, i, memory_type)) { + GELOGW("Get workspace memory type failed."); + return; + } + MemoryBlock *mem_block = ApplyMemory( + GetBlockSize(static_cast(temp[i]), ranges), static_cast(temp[i]), static_cast(temp[i]), + kWorkspace, n, static_cast(i), workspace_reuse_flag, 
is_op_reuse_mem_, false, memory_type); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(mem_block == nullptr, continue, "failed to apply memory block."); - CheckWorkspaceReuse(workspace_reuse_flag, i, stream_id, mem_block); + CheckWorkspaceReuse(workspace_reuse_flag, i, stream_id, mem_block, memory_type); + } + for (auto it = reusable_blocks_.begin(); it != reusable_blocks_.end(); ++it) { + ReleaseInputNodeOutMemory(node_out_blocks_, it->second[stream_id], n); } - ReleaseInputNodeOutMemory(node_out_blocks_, reusable_blocks_[stream_id], n); } GELOGD("Assigned memory blocks:"); @@ -1265,11 +1388,11 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector &ranges) { } void BlockMemAssigner::CheckWorkspaceReuse(const vector &workspace_reuse_flag, uint32_t index, int64_t stream_id, - MemoryBlock *mem_block) { + MemoryBlock *mem_block, int64_t memory_type) { bool reuse_mem_flag = ((workspace_reuse_flag.size() > index) && (workspace_reuse_flag[index] == false)) ? false : true; if (reuse_mem_flag) { - stream_workspace_blocks_[stream_id].emplace_back(mem_block); + stream_workspace_blocks_[memory_type][stream_id].emplace_back(mem_block); } } @@ -1277,10 +1400,10 @@ void BlockMemAssigner::GetNodeWorkSpaceSize(const NodePtr &node, vector GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node->GetOpDesc() == nullptr, return, "Op desc is null."); vector workspace_byte_nums = node->GetOpDesc()->GetWorkspaceBytes(); - GELOGD("GetNodeWorkSpaceSize: node[%s] size:%zu", node->GetOpDesc()->GetName().c_str(), workspace_byte_nums.size()); + GELOGD("node[%s] size:%zu", node->GetOpDesc()->GetName().c_str(), workspace_byte_nums.size()); for (int64_t byte_size : workspace_byte_nums) { workspace_memory.emplace_back(byte_size); - GELOGD("GetNodeWorkSpaceSize: push back size:%ld", byte_size); + GELOGD("push back size:%ld", byte_size); } } @@ -1477,16 +1600,28 @@ void BlockMemAssigner::ResizeMemoryBlocks() { if (memory_block == nullptr || memory_block->deleted_block_ || memory_block->is_zero_copy_) { continue; } - if 
(memory_block->first_continuous_block_) { - mem_offset_ += MEM_ALIGN_SIZE; - } + if (memory_block->memory_type_ == RT_MEMORY_HBM) { + if (memory_block->first_continuous_block_) { + mem_offset_ += MEM_ALIGN_SIZE; + } - memory_block->Resize(); - memory_block->SetHeadOffset(mem_offset_); - mem_offset_ += memory_block->Size(); - memory_block->SetTailOffset(mem_offset_ - 1); + memory_block->Resize(); + memory_block->SetHeadOffset(mem_offset_); + mem_offset_ += memory_block->Size(); + memory_block->SetTailOffset(mem_offset_ - 1); + } else if (memory_block->memory_type_ == RT_MEMORY_P2P_DDR) { + if (memory_block->first_continuous_block_) { + p2p_mem_offset_ += MEM_ALIGN_SIZE; + } + + memory_block->Resize(); + memory_block->SetHeadOffset(p2p_mem_offset_); + p2p_mem_offset_ += memory_block->Size(); + memory_block->SetTailOffset(p2p_mem_offset_ - 1); + } } - GELOGI("mem_offset_ exclude zero_copy_memory is %zu.", mem_offset_); + GELOGI("mem_offset_ exclude zero_copy_memory is %zu, p2p_mem_offset_ exclude zero_copy_memory is %zu.", mem_offset_, + p2p_mem_offset_); } /// @@ -1617,4 +1752,19 @@ bool BlockMemAssigner::CheckIsZeroMemNodeType(const string &node_type) const { (node_type == ASSIGNSUB) || (node_type == ASSIGN) || (node_type == HVDWAIT) || (node_type == HVDCALLBACKBROADCAST); } + +bool BlockMemAssigner::GetWorkSpaceMemoryType(const NodePtr &node, size_t index, int64_t &memory_type) { + memory_type = RT_MEMORY_HBM; + vector workspace_memory_type; + auto op_desc = node->GetOpDesc(); + bool has_workspace_mem_type_attr = + ge::AttrUtils::GetListInt(op_desc, TVM_ATTR_NAME_WORKSPACE_TYPE, workspace_memory_type); + if (has_workspace_mem_type_attr && (workspace_memory_type.size() <= index)) { + GELOGE(INTERNAL_ERROR, "node[%s], workspace_memory size error![index:%zu, workspace:%zu]", node->GetName().c_str(), + index, workspace_memory_type.size()); + return false; + } + memory_type = has_workspace_mem_type_attr ? 
workspace_memory_type[index] : RT_MEMORY_HBM; + return true; +} } // namespace ge diff --git a/src/ge/graph/build/memory/block_mem_assigner.h b/src/ge/graph/build/memory/block_mem_assigner.h index 6137911c..8dfb88c8 100644 --- a/src/ge/graph/build/memory/block_mem_assigner.h +++ b/src/ge/graph/build/memory/block_mem_assigner.h @@ -36,14 +36,14 @@ const size_t kMaxLifeTime = 0xffffffff; using DependStreamLife = std::map>; -enum MemoryType { kOutput, kWorkspace }; +enum OpMemoryType { kOutput, kWorkspace }; struct NodeTypeIndex { - NodeTypeIndex(ge::NodePtr node, MemoryType mem_type, uint32_t index, bool ref_input = false) + NodeTypeIndex(ge::NodePtr node, OpMemoryType mem_type, uint32_t index, bool ref_input = false) : node(std::move(node)), mem_type(mem_type), index(index), ref_input(ref_input) {} ge::NodePtr node = nullptr; - MemoryType mem_type = kOutput; + OpMemoryType mem_type = kOutput; uint32_t index = 0; size_t life_time_end = kMaxLifeTime; bool ref_input = false; @@ -59,7 +59,8 @@ struct NodeTypeIndex { class MemoryBlock { public: - explicit MemoryBlock(size_t block_size, int64_t stream_id = 0, bool reuse_mem = true) + explicit MemoryBlock(size_t block_size, int64_t stream_id = 0, bool reuse_mem = true, + int64_t memory_type = RT_MEMORY_HBM) : ref_count_(0), stream_id_(stream_id), deleted_block_(false), @@ -69,6 +70,7 @@ class MemoryBlock { first_continuous_block_(false), last_continuous_block_(false), is_zero_copy_(false), + memory_type_(memory_type), block_size_(block_size), head_offset_(0), tail_offset_(0), @@ -83,7 +85,7 @@ class MemoryBlock { symbol_list_.clear(); } - void Init(size_t real_size, MemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size) { + void Init(size_t real_size, OpMemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size) { real_size_list_.emplace_back(real_size); no_align_size_list_.emplace_back(no_align_size); node_type_index_list_.emplace_back(node, type, out_index, false); @@ 
-144,6 +146,7 @@ class MemoryBlock { bool last_continuous_block_; bool is_zero_copy_; std::map depend_stream_life_; + int64_t memory_type_; private: size_t block_size_; @@ -170,11 +173,13 @@ class BlockMemAssigner : public MemAssigner { Status Assign() override; - size_t GetMemOffset() const { return mem_offset_; }; + size_t GetMemOffset() const { return mem_offset_; } - int64_t GetAtomicAddrCleanId() const { return atomic_addr_clean_id_; }; + size_t GetP2PMemOffset() const { return p2p_mem_offset_; } - std::vector GetMemoryBlocks() const { return memory_blocks_; }; + int64_t GetAtomicAddrCleanId() const { return atomic_addr_clean_id_; } + + std::vector GetMemoryBlocks() const { return memory_blocks_; } /// /// @ingroup domi @@ -254,7 +259,26 @@ class BlockMemAssigner : public MemAssigner { /// void PrintSymbolMap(); + /// + /// @ingroup GE + /// @brief Get the memory type corresponding to the current symbol. + /// @param [in] node_index_io_list + /// @param [out] memory_type + /// @return void + /// + void GetSymbolMemType(std::list node_index_io_list, int64_t &memory_type); + + /// + /// @ingroup GE + /// @brief Update input tensor or output tensor of op to new memory type attr. 
+ /// @param [in] node_index_io_list + /// @param [in] memory_type + /// @return void + /// + void UpdateOpTensorMemType(std::list node_index_io_list, int64_t memory_type); + size_t mem_offset_; + size_t p2p_mem_offset_; ge::ComputeGraphPtr compute_graph_; @@ -269,14 +293,17 @@ class BlockMemAssigner : public MemAssigner { std::map pre_reuse_flag_; std::map post_reuse_flag_; std::map symbol_size_; + std::map symbol_to_mem_type_; private: /// /// @ingroup GE /// @brief Traversing the compute_graph_ to apply for output memory while considering reuse - /// @param [in] n node in compute_graph_ - /// @param [in] index output node index - /// @param [in] ranges available memory specifications + /// @param [in] n: node in compute_graph_ + /// @param [in] index: output node index + /// @param [in] ranges: available memory specifications + /// @param [in] is_op_reuse_mem: Whether the op reuses the memory, true: reuse; false: not reuse + /// @param [in] continuous: Whether the op uses continuous memory /// @return MemoryBlock* /// @author /// @@ -293,12 +320,15 @@ class BlockMemAssigner : public MemAssigner { /// @param [in] n node in compute_graph_ /// @param [in] out_index output node index /// @param [in] workspace_reuse_flag reuse flag for workspace + /// @param [in] is_op_reuse_mem whether the op reuses memory + /// @param [in] continuous whether the memory of op is continuous + /// @param [in] memory_type device memory type /// @return MemoryBlock* /// @author /// - MemoryBlock *ApplyMemory(size_t block_size, size_t real_size, size_t no_align_size, MemoryType mem_type, + MemoryBlock *ApplyMemory(size_t block_size, size_t real_size, size_t no_align_size, OpMemoryType mem_type, const ge::NodePtr &n, uint32_t out_index, const std::vector &workspace_reuse_flag, - const bool is_op_reuse_mem, const bool continuous); + const bool is_op_reuse_mem, const bool continuous, int64_t memory_type); /// /// @ingroup GE @@ -307,11 +337,12 @@ class BlockMemAssigner : public MemAssigner 
{ /// @param [in] index out index /// @param [in] stream_id which stream op in /// @param [in] mem_block node workspace mem_block + /// @param [in] memory_type workspace memory type /// @return void /// @author /// void CheckWorkspaceReuse(const vector &workspace_reuse_flag, uint32_t index, int64_t stream_id, - MemoryBlock *mem_block); + MemoryBlock *mem_block, int64_t memory_type); /// /// @ingroup GE @@ -358,7 +389,7 @@ class BlockMemAssigner : public MemAssigner { bool IsZeroCopyBlock(const NodePtr &node, bool continuous); bool IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t out_index, std::string &peer_name, - uint32_t &peer_input_index, bool &no_need_assign_memory); + uint32_t &peer_input_index, bool &no_need_assign_memory, bool &reset_zero_copy_flag); /// /// @ingroup GE @@ -373,13 +404,15 @@ class BlockMemAssigner : public MemAssigner { bool IsContinuousOutput(const NodePtr &n); + bool GetWorkSpaceMemoryType(const NodePtr &node, size_t index, int64_t &memory_type); + MemoryBlock *ApplyContinuousMemory(const NodePtr &n, const vector &ranges, const bool is_op_reuse_mem); - std::unordered_map> reusable_blocks_; + std::unordered_map>> reusable_blocks_; std::map reusable_block_counts_; - std::unordered_map> stream_workspace_blocks_; + std::unordered_map>> stream_workspace_blocks_; std::unordered_map> node_out_blocks_; diff --git a/src/ge/graph/build/memory/graph_mem_assigner.cc b/src/ge/graph/build/memory/graph_mem_assigner.cc index 1cdb2efa..fbed5c77 100644 --- a/src/ge/graph/build/memory/graph_mem_assigner.cc +++ b/src/ge/graph/build/memory/graph_mem_assigner.cc @@ -96,7 +96,12 @@ Status GraphMemoryAssigner::AssignMemory() { return ge::FAILED; } MemoryOffset memory_offset(RT_MEMORY_HBM, mem_assigner->GetMemOffset()); - memory_offset_.push_back(memory_offset); + memory_offset_.emplace(RT_MEMORY_HBM, memory_offset); + + if (mem_assigner->GetP2PMemOffset() > 0) { + MemoryOffset p2p_memory_offset(RT_MEMORY_P2P_DDR, mem_assigner->GetP2PMemOffset()); + 
memory_offset_.emplace(RT_MEMORY_P2P_DDR, p2p_memory_offset); + } auto session_id = compute_graph_->GetSessionID(); int64_t var_size_before_assign = ge::VarManager::Instance(session_id)->GetVarMemSize(RT_MEMORY_HBM); @@ -232,7 +237,7 @@ Status GraphMemoryAssigner::GetMaxBatchLabel(const map> return SUCCESS; } -Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, size_t &mem_offset) { +Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, map &mem_type_to_offset) { if (memory_offset_.empty()) { GELOGE(FAILED, "memory_offset_ is empty."); return ge::FAILED; @@ -248,26 +253,32 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, size_t &mem_offse GE_CHK_STATUS_RET(ReAssignAtomicMemory(is_loop_graph), "ReAssignAtomicMemory Failed!"); - mem_offset = memory_offset_[0].mem_offset_; + size_t total_mem_offset = 0; + for (auto pair : memory_offset_) { + mem_type_to_offset[pair.first] = pair.second.mem_offset_; + total_mem_offset += pair.second.mem_offset_; + } auto session_id = compute_graph_->GetSessionID(); - if (mem_offset > VarManager::Instance(session_id)->GetGraphMemoryMaxSize()) { - GELOGE(ge::FAILED, "Current memoffset %zu is greater than memory manager malloc max size %zu", mem_offset, + if (total_mem_offset > VarManager::Instance(session_id)->GetGraphMemoryMaxSize()) { + GELOGE(ge::FAILED, "Current memoffset %zu is greater than memory manager malloc max size %zu", total_mem_offset, VarManager::Instance(session_id)->GetGraphMemoryMaxSize()); - ErrorManager::GetInstance().ATCReportErrMessage( - "E19022", {"size", "item", "maxsize"}, - {std::to_string(mem_offset), "featuremap", - std::to_string(VarManager::Instance(session_id)->GetGraphMemoryMaxSize())}); + for (auto iter : mem_type_to_offset) { + ErrorManager::GetInstance().ATCReportErrMessage( + "E19022", {"memType", "size", "item", "maxsize"}, + {std::to_string(iter.first), std::to_string(iter.second), "featuremap", + 
std::to_string(VarManager::Instance(session_id)->GetGraphMemoryMaxSize())}); + } return ge::FAILED; } return SUCCESS; } -Status GraphMemoryAssigner::AssignZeroCopyMemory(size_t &mem_offset, size_t &zero_mem_copy_size) { +Status GraphMemoryAssigner::AssignZeroCopyMemory(map &mem_offset, size_t &zero_mem_copy_size) { BlockMemAssignerPtr priority_assigner = std::move(mem_assigner_->GetPriorityAssinger()); GE_IF_BOOL_EXEC(priority_assigner == nullptr, GELOGE(FAILED, "Get priority_assigner failed."); return ge::FAILED;); - size_t mem_offset_tmp = mem_offset; + size_t mem_offset_tmp = mem_offset[RT_MEMORY_HBM]; // set offset for zero copy block for (auto &memory_block : priority_assigner->GetMemoryBlocks()) { @@ -275,18 +286,24 @@ Status GraphMemoryAssigner::AssignZeroCopyMemory(size_t &mem_offset, size_t &zer continue; } memory_block->Resize(); - memory_block->SetHeadOffset(mem_offset); - mem_offset += memory_block->Size(); - memory_block->SetTailOffset(mem_offset - 1); + memory_block->SetHeadOffset(mem_offset[RT_MEMORY_HBM]); + mem_offset[RT_MEMORY_HBM] += memory_block->Size(); + memory_block->SetTailOffset(mem_offset[RT_MEMORY_HBM] - 1); } - GELOGI("mem_offset_ include zero_copy_memory is %zu.", mem_offset); + GELOGI("mem_offset_ include zero_copy_memory is %zu.", mem_offset[RT_MEMORY_HBM]); // set offset for zero copy nodes priority_assigner->SetOpMemOffset(true); - zero_mem_copy_size = mem_offset - mem_offset_tmp; - memory_offset_[0].mem_offset_ = mem_offset; + zero_mem_copy_size = mem_offset[RT_MEMORY_HBM] - mem_offset_tmp; + auto iter = memory_offset_.find(RT_MEMORY_HBM); + if (iter == memory_offset_.end()) { + GELOGE(FAILED, "Memory offset don't have memory type[hbm]."); + return FAILED; + } + iter->second.mem_offset_ = mem_offset[RT_MEMORY_HBM]; - GELOGI("max_mem_offset:%zu, mem_offset:%zu, zero_mem_copy_size:%zu.", mem_offset, mem_offset_tmp, zero_mem_copy_size); + GELOGI("max_mem_offset:%zu, mem_offset:%zu, zero_mem_copy_size:%zu.", mem_offset[RT_MEMORY_HBM], 
mem_offset_tmp, + zero_mem_copy_size); return SUCCESS; } @@ -303,9 +320,11 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) { // Assign continuous input memory if (is_input_continuous) { + int64_t memory_type = RT_MEMORY_HBM; + GE_CHK_STATUS_RET(GetNodeMemoryType(node, memory_type, "input"), "Get node memory type failed."); int64_t mem_clean_start = 0; int64_t mem_clean_size = 0; - ret = AssignContinuousInputMemory(node, mem_clean_start, mem_clean_size); + ret = AssignContinuousInputMemory(node, mem_clean_start, mem_clean_size, memory_type); if (ret != ge::SUCCESS) { GELOGE(ret, "Assign continuous input memory failed!"); return ret; @@ -360,17 +379,23 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) { } } } - - GELOGI("After reassign continuous memory, memoffset = %zu.", memory_offset_[0].mem_offset_); + for (auto pair : memory_offset_) { + GELOGI("After reassign continuous memory, memory type = %ld, memoffset = %zu.", pair.first, + pair.second.mem_offset_); + } return ge::SUCCESS; } Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start, - int64_t &continuous_mem_size) { + int64_t &continuous_mem_size, int64_t memory_type) { GELOGI("Current node %s needs continuous input.", node->GetName().c_str()); - continuous_mem_start = memory_offset_[0].mem_offset_; bool continuous_input_alloc = false; (void)ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT_ALLOC, continuous_input_alloc); + auto iter = memory_offset_.find(memory_type); + if (iter == memory_offset_.end()) { + GELOGE(FAILED, "Memory offset don't have memory type[%ld].", memory_type); + return FAILED; + } for (auto &in_data_anchor : node->GetAllInDataAnchors()) { auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor(); GE_IF_BOOL_EXEC(peer_out_data_anchor == nullptr, continue); @@ -430,19 +455,20 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, 
0, 0); continue; } - output_list.at(peer_out_data_anchor->GetIdx()) = memory_offset_[0].mem_offset_; + + output_list.at(peer_out_data_anchor->GetIdx()) = iter->second.mem_offset_; } else { GELOGE(FAILED, "index : %d is out of range.", peer_out_data_anchor->GetIdx()); return FAILED; } peer_op_desc->SetOutputOffset(output_list); - size_t pre_mem_offset = memory_offset_[0].mem_offset_; + size_t pre_mem_offset = iter->second.mem_offset_; int64_t tensor_desc_size = 0; if (has_offset_attr) { if (peer_out_data_anchor->GetIdx() < static_cast(offsets_for_fusion.size())) { auto offset_for_fusion = offsets_for_fusion[peer_out_data_anchor->GetIdx()]; - memory_offset_[0].mem_offset_ += offset_for_fusion; + iter->second.mem_offset_ += offset_for_fusion; } else { GELOGE(FAILED, "fusion: peer node %s index : %d is out of range.", peer_op_desc->GetName().c_str(), peer_out_data_anchor->GetIdx()); @@ -453,25 +479,25 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())), tensor_desc_size); GE_IF_BOOL_EXEC(ret != ge::SUCCESS, GELOGE(FAILED, "GetSize failed."); return FAILED;); - memory_offset_[0].mem_offset_ += tensor_desc_size; + iter->second.mem_offset_ += tensor_desc_size; } // If set tensor_actual_size, Memory alignment is not required. 
int32_t is_tensor_actual_size = 0; ge::AttrUtils::GetInt(peer_op_desc, ATTR_NAME_GET_TENSOR_ACTUAL_SIZE, is_tensor_actual_size); if (is_tensor_actual_size == 0) { - AlignMemOffset(MEM_ALIGN_SIZE); + AlignMemOffset(MEM_ALIGN_SIZE, memory_type); } GELOGI( "[IMAS]Continuous input : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%zu] " "real_size[%ld].", node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(), peer_out_data_anchor->GetIdx(), - pre_mem_offset, peer_op_desc->GetStreamId(), (memory_offset_[0].mem_offset_ - pre_mem_offset), tensor_desc_size); + pre_mem_offset, peer_op_desc->GetStreamId(), (iter->second.mem_offset_ - pre_mem_offset), tensor_desc_size); } - memory_offset_[0].mem_offset_ += MEM_ALIGN_SIZE; + iter->second.mem_offset_ += MEM_ALIGN_SIZE; if (!continuous_input_alloc) { - continuous_mem_size = memory_offset_[0].mem_offset_ - continuous_mem_start; + continuous_mem_size = iter->second.mem_offset_ - continuous_mem_start; } return SUCCESS; } @@ -576,6 +602,7 @@ Status GraphMemoryAssigner::ReAssignVirtualInputNodeMemory(NodePtr node, size_t Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousInputMemory() { map> mem_reuse_virtual_input_nodes_map; + int64_t memory_type = RT_MEMORY_HBM; for (const auto &n : compute_graph_->GetAllNodes()) { OpDescPtr op_desc = n->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); @@ -585,7 +612,6 @@ Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousInputMemory() { bool attr_reuse = false; bool get_reuse_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse); GE_IF_BOOL_EXEC(!get_reuse_flag, continue); - if (attr_reuse && attr_continuous) { if (op_desc->GetOutputsSize() != kVirtualInputNodeOutputSize) { // When current virtual node has several outputs, can't directly determine which input is the tensor for reuse. 
@@ -593,13 +619,19 @@ Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousInputMemory() { op_desc->GetOutputsSize()); return FAILED; } - - GELOGD("Start to reassign memory for virtual input node, memory offset = %zu.", memory_offset_[0].mem_offset_); + GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "input"), "Get node memory type failed."); + auto iter = memory_offset_.find(memory_type); + if (iter == memory_offset_.end()) { + GELOGE(FAILED, "Memory offset don't have memory type[%ld].", memory_type); + return FAILED; + } + GELOGD("Start to reassign memory for virtual input node, memory offset = %zu, memory type = %ld.", + iter->second.mem_offset_, memory_type); string batch_label_string; // Not all ops have ATTR_NAME_BATCH_LABEL, no need to check return value, only check out parameter (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string); if (batch_label_string.empty()) { - size_t node_mem_offset = memory_offset_[0].mem_offset_; + size_t node_mem_offset = iter->second.mem_offset_; // No ATTR_NAME_BATCH_LABEL, no need to reuse memory. Status status = ReAssignVirtualInputNodeMemory(n, node_mem_offset); if (status != SUCCESS) { @@ -607,9 +639,10 @@ Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousInputMemory() { return FAILED; } - memory_offset_[0].mem_offset_ = node_mem_offset; - AlignMemOffset(MEM_ALIGN_SIZE); - GELOGD("After reassign memory for virtual input node, align memory = %zu.", memory_offset_[0].mem_offset_); + iter->second.mem_offset_ = node_mem_offset; + AlignMemOffset(MEM_ALIGN_SIZE, memory_type); + GELOGD("After reassign memory for virtual input node, align memory = %zu, memory type = %ld.", + iter->second.mem_offset_, memory_type); } else { // Has ATTR_NAME_BATCH_LABEL, for dynamic multi-batch node, need to reuse memory. 
string current_node_full_name = op_desc->GetName(); @@ -709,6 +742,7 @@ Status GraphMemoryAssigner::ReAssignVirtualOutputNodeMemory(NodePtr node, size_t Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousOutputMemory() { map> mem_reuse_virtual_output_nodes_map; + int64_t memory_type = RT_MEMORY_HBM; for (const auto &n : compute_graph_->GetAllNodes()) { OpDescPtr op_desc = n->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); @@ -727,22 +761,29 @@ Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousOutputMemory() { in_data_anchor_list.size()); return FAILED; } - - GELOGD("Start to reassign memory for virtual output node, memory offset = %zu.", memory_offset_[0].mem_offset_); + GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "output"), "Get node memory type failed."); + auto iter = memory_offset_.find(memory_type); + if (iter == memory_offset_.end()) { + GELOGE(FAILED, "Memory offset don't have memory type[%ld].", memory_type); + return FAILED; + } + GELOGD("Start to reassign memory for virtual output node, memory offset = %zu, memory type = %ld.", + iter->second.mem_offset_, memory_type); string batch_label_string; // Not all ops have ATTR_NAME_BATCH_LABEL, no need to check return value, only check out parameter (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string); if (batch_label_string.empty()) { - size_t node_mem_offset = memory_offset_[0].mem_offset_; + size_t node_mem_offset = iter->second.mem_offset_; // No ATTR_NAME_BATCH_LABEL, no need to reuse memory. 
Status status = ReAssignVirtualOutputNodeMemory(n, node_mem_offset); if (status != SUCCESS) { GELOGE(FAILED, "Reassign memory of virtual output node failed, node name: %s.", n->GetName().c_str()); return FAILED; } - memory_offset_[0].mem_offset_ = node_mem_offset; - AlignMemOffset(MEM_ALIGN_SIZE); - GELOGD("After reassign memory for virtual output node, align memory = %zu.", memory_offset_[0].mem_offset_); + iter->second.mem_offset_ = node_mem_offset; + AlignMemOffset(MEM_ALIGN_SIZE, memory_type); + GELOGD("After reassign memory for virtual output node, align memory = %zu, memory type = %ld.", + iter->second.mem_offset_, memory_type); } else { // Has ATTR_NAME_BATCH_LABEL, for dynamic multi-batch node, need to reuse memory. string current_node_full_name = op_desc->GetName(); @@ -775,20 +816,23 @@ Status GraphMemoryAssigner::ReAssignVirtualNodesMemory(map nodes_mem_offset_list; for (auto &i_map : mem_reuse_nodes_map) { - size_t max_batch_node_mem_offset = memory_offset_[0].mem_offset_; - nodes_mem_offset_list.emplace_back(max_batch_node_mem_offset); - vector virtual_nodes_list = i_map.second; + int64_t memory_type = RT_MEMORY_HBM; + GE_CHK_STATUS_RET(GetNodeListMemoryType(virtual_nodes_list, mem_reuse_model, memory_type), + "Get node list memory type failed."); + auto iter = memory_offset_.find(memory_type); + if (iter == memory_offset_.end()) { + GELOGE(FAILED, "Memory offset don't have memory type[%ld].", memory_type); + return FAILED; + } + size_t max_batch_node_mem_offset = iter->second.mem_offset_; + nodes_mem_offset_list.emplace_back(max_batch_node_mem_offset); for (auto &i_node : virtual_nodes_list) { // Op_desc is not nullptr, it has been checked. 
OpDescPtr op_desc = i_node->GetOpDesc(); @@ -810,18 +854,16 @@ Status GraphMemoryAssigner::ReAssignVirtualNodesMemory(mapGetName().c_str()); return FAILED; } - memory_offset_[0].mem_offset_ = max_batch_node_mem_offset; - AlignMemOffset(MEM_ALIGN_SIZE); - GELOGD("After reassign memory for virtual node, align memory = %zu.", memory_offset_[0].mem_offset_); + iter->second.mem_offset_ = max_batch_node_mem_offset; + AlignMemOffset(MEM_ALIGN_SIZE, memory_type); + GELOGD("After reassign memory for virtual node, align memory = %zu, memory type = %ld.", + iter->second.mem_offset_, memory_type); // Only assign memory of max batch nodes. break; } } } - - // Assign memory of remaining nodes that have the same fixed_name. - GELOGD("Start to reassign memory for remaining batch virtual nodes, memory offset = %zu.", - memory_offset_[0].mem_offset_); + PrintMemoryOffset(); size_t memory_reuse_index = 0; for (auto &i_map : mem_reuse_nodes_map) { vector virtual_nodes_list = i_map.second; @@ -856,8 +898,14 @@ Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) { return status; } + auto mem_iter = memory_offset_.find(RT_MEMORY_HBM); + if (mem_iter == memory_offset_.end()) { + GELOGE(FAILED, "Memory offset don't have memory type[%ld].", RT_MEMORY_HBM); + return FAILED; + } + for (auto &iter : normal_atomic_and_clean_nodes_map) { - int64_t atomic_mem_start = static_cast(memory_offset_[0].mem_offset_); + int64_t atomic_mem_start = static_cast(mem_iter->second.mem_offset_); GELOGD("Begin to reAssign atomic memory, atomic address memory start = %ld", atomic_mem_start); for (auto &atomic_node : iter.second) { @@ -870,11 +918,10 @@ Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) { } } - int64_t atomic_mem_size = static_cast(memory_offset_[0].mem_offset_) - atomic_mem_start; - status = SetAtomicCleanAttr(iter.first, {atomic_mem_start}, {atomic_mem_size}); - if (status != SUCCESS) { - GELOGE(status, "Failed to set attr for atomic addr clean node %s.", 
iter.first->GetName().c_str()); - return status; + int64_t atomic_mem_size = static_cast(mem_iter->second.mem_offset_) - atomic_mem_start; + if (atomic_mem_size != 0) { + GE_CHK_STATUS_RET(SetAtomicCleanAttr(iter.first, {atomic_mem_start}, {atomic_mem_size}), + "Failed to set attr for atomic addr clean node %s.", iter.first->GetName().c_str()); } } @@ -972,6 +1019,11 @@ Status GraphMemoryAssigner::AssignAtomicOutputAndWorkspaceMemory(const ge::NodeP } Status GraphMemoryAssigner::AssignConnectNetOutputAtomicMemory(vector &connect_netoutput_nodes) { + auto iter = memory_offset_.find(RT_MEMORY_HBM); + if (iter == memory_offset_.end()) { + GELOGE(FAILED, "Memory offset don't have memory type[%ld].", RT_MEMORY_HBM); + return FAILED; + } for (auto &node : connect_netoutput_nodes) { GE_CHECK_NOTNULL(node); if (node->GetOpDesc() == nullptr) { @@ -980,7 +1032,7 @@ Status GraphMemoryAssigner::AssignConnectNetOutputAtomicMemory(vector & } // Atomic memory start addr - int64_t original_atomic_mem_start = static_cast(memory_offset_[0].mem_offset_); + int64_t original_atomic_mem_start = static_cast(iter->second.mem_offset_); GELOGD("Start to assign memory of atomic node, node name: %s, node type: %s, mem_offset: %ld.", node->GetName().c_str(), node->GetOpDesc()->GetType().c_str(), original_atomic_mem_start); vector mem_offset_end; @@ -1096,6 +1148,11 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node, ve return ge::FAILED; } auto output_list_size = static_cast(output_list.size()); + auto iter = memory_offset_.find(RT_MEMORY_HBM); + if (iter == memory_offset_.end()) { + GELOGE(FAILED, "Memory offset don't have memory type[%ld].", RT_MEMORY_HBM); + return FAILED; + } for (auto &output_index : atomic_output_index) { if (output_index >= output_list_size) { GELOGE(ge::PARAM_INVALID, "The output index %ld is more than the size %ld of output_list.", output_index, @@ -1125,14 +1182,14 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr 
&node, ve GELOGI("Get size failed"); } - output_list[output_index] = memory_offset_[0].mem_offset_; + output_list[output_index] = iter->second.mem_offset_; GELOGI("[IMAS]Atomic output : Set %s name[%s] output[%ld] offset to [%zu] stream_id[%ld] size[%ld] real_size[%ld].", - compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), output_index, memory_offset_[0].mem_offset_, + compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), output_index, iter->second.mem_offset_, op_desc->GetStreamId(), size, size); - memory_offset_[0].mem_offset_ += size; - AlignMemOffset(MEM_ALIGN_SIZE); - mem_offset_end.emplace_back(memory_offset_[0].mem_offset_); + iter->second.mem_offset_ += size; + AlignMemOffset(MEM_ALIGN_SIZE, RT_MEMORY_HBM); + mem_offset_end.emplace_back(iter->second.mem_offset_); } op_desc->SetOutputOffset(output_list); @@ -1168,6 +1225,11 @@ Status GraphMemoryAssigner::AssignOrdinaryAtomicWorkspaceMemory(const ge::OpDesc map> &workspace_info, vector &mem_offset_end) { GELOGI("Begin to reassign normal atomic memory, node = %s.", op_desc->GetName().c_str()); + auto mem_type_iter = memory_offset_.find(RT_MEMORY_HBM); + if (mem_type_iter == memory_offset_.end()) { + GELOGE(FAILED, "Memory offset don't have memory type[%ld].", RT_MEMORY_HBM); + return FAILED; + } vector workspace_vector = op_desc->GetWorkspace(); for (auto iter = workspace_info.begin(); iter != workspace_info.end(); ++iter) { @@ -1190,15 +1252,15 @@ Status GraphMemoryAssigner::AssignOrdinaryAtomicWorkspaceMemory(const ge::OpDesc return ge::PARAM_INVALID; } - workspace_vector[workspace_index] = memory_offset_[0].mem_offset_; + workspace_vector[workspace_index] = mem_type_iter->second.mem_offset_; GELOGI( "[IMAS]Atomic ordinary workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] " "size[%ld] real_size[%ld].", - compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index, memory_offset_[0].mem_offset_, - op_desc->GetStreamId(), workspace_size, 
workspace_size); + compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index, + mem_type_iter->second.mem_offset_, op_desc->GetStreamId(), workspace_size, workspace_size); - memory_offset_[0].mem_offset_ += workspace_size; - mem_offset_end.emplace_back(memory_offset_[0].mem_offset_); + mem_type_iter->second.mem_offset_ += workspace_size; + mem_offset_end.emplace_back(mem_type_iter->second.mem_offset_); } } op_desc->SetWorkspace(workspace_vector); @@ -1210,6 +1272,11 @@ Status GraphMemoryAssigner::AssignFusionAtomicWorkspaceMemory(const ge::OpDescPt map> &workspace_info, vector &mem_offset_end) { GELOGI("Begin to reassign fusion atomic memory, node = %s.", op_desc->GetName().c_str()); + auto mem_type_iter = memory_offset_.find(RT_MEMORY_HBM); + if (mem_type_iter == memory_offset_.end()) { + GELOGE(FAILED, "Memory offset don't have memory type[%ld].", RT_MEMORY_HBM); + return FAILED; + } map> sub_node_workspace_offset; for (auto &iter : workspace_info) { @@ -1222,15 +1289,15 @@ Status GraphMemoryAssigner::AssignFusionAtomicWorkspaceMemory(const ge::OpDescPt auto workspace_index = static_cast(info_iter.first); auto workspace_size = info_iter.second; - size_t workspace_offset = memory_offset_[0].mem_offset_; + size_t workspace_offset = mem_type_iter->second.mem_offset_; GELOGI( "[IMAS]Atomic fusion workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] size[%ld] " "real_size[%ld].", - compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index, memory_offset_[0].mem_offset_, - op_desc->GetStreamId(), workspace_size, workspace_size); + compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index, + mem_type_iter->second.mem_offset_, op_desc->GetStreamId(), workspace_size, workspace_size); - memory_offset_[0].mem_offset_ += workspace_size; - mem_offset_end.emplace_back(memory_offset_[0].mem_offset_); + mem_type_iter->second.mem_offset_ += workspace_size; + 
mem_offset_end.emplace_back(mem_type_iter->second.mem_offset_); index_offset.insert(std::make_pair(workspace_index, workspace_offset)); } sub_node_workspace_offset.insert(std::make_pair(iter.first, index_offset)); @@ -1295,8 +1362,11 @@ ge::Status GraphMemoryAssigner::SetInputOffset() { GELOGE(FAILED, "memory_offset_ is empty."); return FAILED; } - GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu]", compute_graph_->GetName().c_str(), - memory_offset_[0].mem_offset_); + for (auto pair : memory_offset_) { + GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu], memory type[%ld]", compute_graph_->GetName().c_str(), + pair.second.mem_offset_, pair.first); + } + for (const ge::NodePtr &node : compute_graph_->GetAllNodes()) { if (UpdateOpInputOffset(node) != ge::SUCCESS) { GELOGE(ge::FAILED, "Update op input offset failed"); @@ -1374,7 +1444,7 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector< auto out_index = static_cast(peer_out_anchor->GetIdx()); if (output_list.size() > static_cast(out_index)) { int64_t input_offset = output_list.at(out_index); - if (has_mem_type_attr) { + if (has_mem_type_attr && !origin_input_list.empty()) { auto input_size = tmp_op_desc->GetInputsSize(); auto ori_input_offset_list_size = origin_input_list.size(); auto mem_type_size = memory_type.size(); @@ -1470,7 +1540,6 @@ Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, in memory_offset_size.emplace_back(size); } memory_offset_start.pop_back(); - const auto &in_control_anchor = node->GetInControlAnchor(); if (!memory_offset_size.empty() && in_control_anchor != nullptr) { for (auto &peer_out_control_anchor : in_control_anchor->GetPeerOutControlAnchors()) { @@ -1543,11 +1612,92 @@ ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const ve return SUCCESS; } -void GraphMemoryAssigner::AlignMemOffset(const int64_t &mem_align_size) { +void GraphMemoryAssigner::AlignMemOffset(const int64_t &mem_align_size, int64_t 
memory_type) { if (mem_align_size <= 0) { return; } - memory_offset_[0].mem_offset_ = - (memory_offset_[0].mem_offset_ + mem_align_size - 1) / mem_align_size * mem_align_size; + auto iter = memory_offset_.find(memory_type); + if (iter == memory_offset_.end()) { + GELOGW("Memory offset don't have memory type[%ld].", memory_type); + return; + } + iter->second.mem_offset_ = (iter->second.mem_offset_ + mem_align_size - 1) / mem_align_size * mem_align_size; +} + +ge::Status GraphMemoryAssigner::GetNodeListMemoryType(const vector &nodes, int32_t mem_reuse_model, + int64_t &memory_type) { + memory_type = RT_MEMORY_HBM; + // In the dynamic batch scenario, the memory attributes of nodes are the same. + for (auto &n : nodes) { + if (mem_reuse_model == kVirtualInputNodeMemoryReuse) { + GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "input"), "Get node memory type failed.") + break; + } + + if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) { + GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "output"), "Get node memory type failed."); + break; + } + } + return SUCCESS; +} + +ge::Status GraphMemoryAssigner::GetNodeMemoryType(const NodePtr &node, int64_t &memory_type, string input_or_output) { + memory_type = RT_MEMORY_HBM; + vector mem_type_list; + if (input_or_output == "input") { + (void)ge::AttrUtils::GetListInt(node->GetOpDesc(), ATTR_NAME_INPUT_MEM_TYPE_LIST, mem_type_list); + } + if (input_or_output == "output") { + (void)ge::AttrUtils::GetListInt(node->GetOpDesc(), ATTR_NAME_OUTPUT_MEM_TYPE_LIST, mem_type_list); + } + if (mem_type_list.empty()) { + if (memory_offset_.find(memory_type) == memory_offset_.end()) { + GELOGE(FAILED, "Memory offset map does not have memory type[%ld].", memory_type); + return FAILED; + } + return SUCCESS; + } + + if (mem_type_list.size() != node->GetAllInDataAnchorsSize()) { + GELOGE(FAILED, "The size[%zu] of mem type list is not equal to the size of in data anchor[%u].", + mem_type_list.size(), node->GetAllInDataAnchorsSize()); + 
return FAILED; + } + + if (!CheckContinuousMemType(mem_type_list)) { + GELOGE(FAILED, "Check continuous memory type failed."); + return FAILED; + } + // It is continuous memory and memory type is the same, so use the first memory. + memory_type = mem_type_list[0]; + return SUCCESS; +} + +bool GraphMemoryAssigner::CheckContinuousMemType(vector mem_type_list) { + if (mem_type_list.size() == 0) { + return true; + } + int64_t mem_type_tmp = mem_type_list[0]; + for (auto mem_type : mem_type_list) { + if (mem_type != mem_type_tmp) { + GELOGW("The memory is continuous, but the type of the input memory is inconsistent. They are [%ld] and [%ld].", + mem_type_tmp, mem_type); + return false; + } + } + if (memory_offset_.find(mem_type_tmp) == memory_offset_.end()) { + GELOGW("Memory offset map does not have memory type[%ld].", mem_type_tmp); + return false; + } + return true; +} + +void GraphMemoryAssigner::PrintMemoryOffset() { + for (auto pair : memory_offset_) { + // Assign memory of max batch nodes that have the same batch label. 
+ GELOGD("Reassign memory for max batch virtual nodes, memory type = %ld, memory offset = %zu.", pair.first, + pair.second.mem_offset_); + } } } // namespace ge diff --git a/src/ge/graph/build/memory/graph_mem_assigner.h b/src/ge/graph/build/memory/graph_mem_assigner.h index 201e6d01..7bc929c0 100644 --- a/src/ge/graph/build/memory/graph_mem_assigner.h +++ b/src/ge/graph/build/memory/graph_mem_assigner.h @@ -37,7 +37,7 @@ struct MemoryOffset { size_t mem_offset_; }; -using MemoryOffsetList = vector; +using MemoryOffsetMap = std::map; class VariableMemoryAssigner { public: @@ -97,9 +97,9 @@ class GraphMemoryAssigner { /// ge::Status AssignVarAttr2Nodes(); - ge::Status ReAssignMemory(bool is_loop_graph, size_t &mem_offset); + ge::Status ReAssignMemory(bool is_loop_graph, map &mem_type_to_offset); - ge::Status AssignZeroCopyMemory(size_t &mem_offset, size_t &zero_mem_copy_size); + ge::Status AssignZeroCopyMemory(map &mem_offset, size_t &zero_mem_copy_size); ge::Status SetInputOffset(); @@ -139,7 +139,7 @@ class GraphMemoryAssigner { std::vector &connecting_output_atomic_nodes); ge::Status AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start, - int64_t &continuous_mem_size); + int64_t &continuous_mem_size, int64_t memory_type); ge::Status AssignContinuousOutputMemory(const ge::NodePtr &node); @@ -174,7 +174,7 @@ class GraphMemoryAssigner { ge::Status IsIndependentAtomicClean(const ge::NodePtr &node, bool &is_independent_atomic_clean_node); - void AlignMemOffset(const int64_t &mem_align_size); + void AlignMemOffset(const int64_t &mem_align_size, int64_t memory_type); ge::Status UpdateOpInputOffset(const NodePtr &node, vector &input_list) const; @@ -182,7 +182,14 @@ class GraphMemoryAssigner { NodePtr GetKnownInputNode(const NodePtr &node) const; - MemoryOffsetList memory_offset_; + ge::Status GetNodeMemoryType(const NodePtr &node, int64_t &memory_type, string input_or_output); + ge::Status GetNodeListMemoryType(const vector &nodes, int32_t 
mem_reuse_model, int64_t &memory_type); + + bool CheckContinuousMemType(vector mem_type_list); + + void PrintMemoryOffset(); + + MemoryOffsetMap memory_offset_; ge::ComputeGraphPtr compute_graph_; HybridMemAssignerPtr mem_assigner_; }; diff --git a/src/ge/graph/build/memory/hybrid_mem_assigner.cc b/src/ge/graph/build/memory/hybrid_mem_assigner.cc index a75487de..32246f7d 100644 --- a/src/ge/graph/build/memory/hybrid_mem_assigner.cc +++ b/src/ge/graph/build/memory/hybrid_mem_assigner.cc @@ -23,7 +23,7 @@ namespace ge { HybridMemAssigner::HybridMemAssigner(ge::ComputeGraphPtr compute_graph) - : mem_offset_(0), compute_graph_(std::move(compute_graph)), priority_assigner_(nullptr) {} + : mem_offset_(0), p2p_mem_offset_(0), compute_graph_(std::move(compute_graph)), priority_assigner_(nullptr) {} Status HybridMemAssigner::AssignMemory(std::unique_ptr &block_assigner, size_t &mem_size) { vector ranges; @@ -73,6 +73,7 @@ Status HybridMemAssigner::Assign() { priority_assigner->SetOpMemOffset(false); mem_offset_ = priority_assigner->GetMemOffset(); + p2p_mem_offset_ = priority_assigner->GetP2PMemOffset(); priority_assigner_ = std::move(priority_assigner); return SUCCESS; diff --git a/src/ge/graph/build/memory/hybrid_mem_assigner.h b/src/ge/graph/build/memory/hybrid_mem_assigner.h index fba70a59..3913fea1 100644 --- a/src/ge/graph/build/memory/hybrid_mem_assigner.h +++ b/src/ge/graph/build/memory/hybrid_mem_assigner.h @@ -43,6 +43,7 @@ class HybridMemAssigner : public MemAssigner { Status Assign() override; size_t GetMemOffset() const { return mem_offset_; } + size_t GetP2PMemOffset() const { return p2p_mem_offset_; } BlockMemAssignerPtr GetPriorityAssinger() const { return priority_assigner_; } @@ -50,6 +51,7 @@ class HybridMemAssigner : public MemAssigner { Status AssignMemory(std::unique_ptr &block_assigner, size_t &mem_size); size_t mem_offset_; + size_t p2p_mem_offset_; ge::ComputeGraphPtr compute_graph_; diff --git a/src/ge/graph/build/memory/memory_assigner.cc 
b/src/ge/graph/build/memory/memory_assigner.cc index e36f082e..271d5633 100644 --- a/src/ge/graph/build/memory/memory_assigner.cc +++ b/src/ge/graph/build/memory/memory_assigner.cc @@ -20,7 +20,7 @@ #include "graph/build/memory/graph_mem_assigner.h" namespace ge { -Status MemoryAssigner::AssignMemory(bool is_loop_graph, size_t &mem_offset, size_t &zero_copy_mem_size) { +Status MemoryAssigner::AssignMemory(bool is_loop_graph, map &mem_offset, size_t &zero_copy_mem_size) { GraphMemoryAssigner graph_mem_assigner(compute_graph_); if (graph_mem_assigner.AssignMemory() != ge::SUCCESS) { diff --git a/src/ge/graph/build/memory/module.mk b/src/ge/graph/build/memory/module.mk index 2b77e40e..47c9e5cc 100644 --- a/src/ge/graph/build/memory/module.mk +++ b/src/ge/graph/build/memory/module.mk @@ -17,6 +17,7 @@ local_lib_inc_path := ${LOCAL_PATH} \ ${TOPDIR}third_party/protobuf/include \ ${TOPDIR}inc/framework \ $(TOPDIR)framework/domi \ + $(TOPDIR)graphengine/ge \ #compiler for host include $(CLEAR_VARS) diff --git a/src/ge/graph/build/memory/var_mem_assign_util.h b/src/ge/graph/build/memory/var_mem_assign_util.h index cb38af29..b34e3646 100644 --- a/src/ge/graph/build/memory/var_mem_assign_util.h +++ b/src/ge/graph/build/memory/var_mem_assign_util.h @@ -29,6 +29,7 @@ class VarMemAssignUtil { static Status AssignStaticMemory2Node(ge::ComputeGraphPtr &compute_graph); static Status AssignVarAttr2Nodes(ge::ComputeGraphPtr &compute_graph); static Status AssignMemory2HasRefAttrNode(ge::ComputeGraphPtr &compute_graph); + static Status AssignData2Fp32Var(const ge::NodePtr &node, uint64_t session_id); private: static Status AssignMemory2VariableNode(ge::ComputeGraphPtr &compute_graph); @@ -40,7 +41,6 @@ class VarMemAssignUtil { static Status DealBroadCastNode(uint32_t graph_id, const ge::NodePtr &node, const ge::InDataAnchorPtr &in_data_anchor, const ge::NodePtr &var_node, uint64_t session_id); - static Status AssignData2Fp32Var(const ge::NodePtr &node, uint64_t session_id); static 
ge::NodePtr GetFinalTransNode(const ge::NodePtr &ref_node); diff --git a/src/ge/graph/build/model_builder.cc b/src/ge/graph/build/model_builder.cc index 9a37478d..6efc78fb 100644 --- a/src/ge/graph/build/model_builder.cc +++ b/src/ge/graph/build/model_builder.cc @@ -93,7 +93,6 @@ ModelBuilder::ModelBuilder(uint64_t session_id, ge::ComputeGraphPtr compute_grap const Graph2SubGraphInfoList &subgraphs, const map &stream_max_parallel_num, bool hcom_parallel, int mode) : session_id_(session_id), - mem_offset_(0), weight_offset_(kWeightsStartOffset), compute_graph_(std::move(compute_graph)), subgraphs_(subgraphs), @@ -104,6 +103,7 @@ ModelBuilder::ModelBuilder(uint64_t session_id, ge::ComputeGraphPtr compute_grap hcom_parallel_(hcom_parallel), build_mode_(mode), max_mem_offset_(0), + p2p_mem_offset_(0), zero_copy_mem_size_(0), platform_type_(0), is_loop_graph_(false), @@ -145,7 +145,7 @@ Status ModelBuilder::CalcOutputSize(const ge::NodePtr &n) { bool ModelBuilder::SetInputConst(const OpDescPtr &op_desc, const NodePtr &src_node, size_t index, vector &is_input_const) { - GELOGI("SetIsInputConst const: %s", op_desc->GetName().c_str()); + GELOGI("SetIsInputConst const: %s, source node: %s", op_desc->GetName().c_str(), src_node->GetName().c_str()); for (size_t i = is_input_const.size(); i <= index; ++i) { is_input_const.push_back(false); } @@ -153,7 +153,7 @@ bool ModelBuilder::SetInputConst(const OpDescPtr &op_desc, const NodePtr &src_no vector weights = OpDescUtils::MutableWeights(src_node); if (weights.empty()) { - GELOGW("SetInputIsConst weights is empty"); + GELOGW("SetInputIsConst weights is empty, node: %s", src_node->GetName().c_str()); return false; } GeTensorPtr weight = weights[0]; @@ -192,6 +192,7 @@ void ModelBuilder::SetInputIsConst(const ge::NodePtr &n) { GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); const auto &src_node = peer_out_anchor->GetOwnerNode(); if (!NodeUtils::GetConstOpType(src_node, const_type)) { + GELOGI("Node %s:%zu, sorce node: %s Not 
Const", n->GetName().c_str(), index, src_node->GetName().c_str()); continue; } @@ -385,10 +386,16 @@ void ModelBuilder::InitL1FusionOption() { Status ModelBuilder::BuildModelDef(ge::Model &model) { ClearOriginalFormat(); - max_mem_offset_ = mem_offset_; + max_mem_offset_ = mem_type_to_mem_offset_[RT_MEMORY_HBM]; GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_MEMORY_SIZE, max_mem_offset_), GELOGE(FAILED, "SetInt of ATTR_MODEL_MEMORY_SIZE failed."); return FAILED); + if (mem_type_to_mem_offset_.find(RT_MEMORY_P2P_DDR) != mem_type_to_mem_offset_.end()) { + p2p_mem_offset_ = mem_type_to_mem_offset_[RT_MEMORY_P2P_DDR]; + } + GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_P2P_MEMORY_SIZE, p2p_mem_offset_), + GELOGE(FAILED, "SetInt of ATTR_MODEL_P2P_MEMORY_SIZE failed."); + return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_WEIGHT_SIZE, weight_offset_), GELOGE(FAILED, "SetInt of ATTR_MODEL_WEIGHT_SIZE failed."); return FAILED); @@ -410,7 +417,8 @@ Status ModelBuilder::BuildModelDef(ge::Model &model) { GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&model, ATTR_MODEL_OUT_NODES_NAME, GetLocalOmgContext().net_out_nodes), GELOGE(FAILED, "SetListStr of ATTR_MODEL_OUT_NODES_NAME failed."); return FAILED); - GELOGI("For model, max_mem_offset_: %zu, zero_copy_mem_size_: %zu", max_mem_offset_, zero_copy_mem_size_); + GELOGI("For model, max_mem_offset_: %zu, p2p_mem_size: %zu, zero_copy_mem_size_: %zu", max_mem_offset_, + p2p_mem_offset_, zero_copy_mem_size_); string ge_core_type; Status ret = ge::GetContext().GetOption(kCoreType, ge_core_type); @@ -713,7 +721,7 @@ Status ModelBuilder::BuildModelForGetTask(ge::Model &model) { GE_TIMESTAMP_START(AssignMemory); MemoryAssigner mem_assigner(compute_graph_); - GE_CHK_STATUS_RET(mem_assigner.AssignMemory(is_loop_graph_, mem_offset_, zero_copy_mem_size_), + GE_CHK_STATUS_RET(mem_assigner.AssignMemory(is_loop_graph_, mem_type_to_mem_offset_, zero_copy_mem_size_), "Assign Memory Failed!"); 
GE_TIMESTAMP_END(AssignMemory, "GraphBuilder::AssignMemory"); diff --git a/src/ge/graph/build/model_builder.h b/src/ge/graph/build/model_builder.h index e54d6695..b2f58f6e 100644 --- a/src/ge/graph/build/model_builder.h +++ b/src/ge/graph/build/model_builder.h @@ -85,7 +85,7 @@ class ModelBuilder { uint64_t session_id_; - size_t mem_offset_; + map mem_type_to_mem_offset_; size_t weight_offset_; @@ -106,6 +106,7 @@ class ModelBuilder { int build_mode_; size_t max_mem_offset_; + size_t p2p_mem_offset_; size_t zero_copy_mem_size_; TBEKernelStore tbe_kernel_store_; diff --git a/src/ge/graph/build/run_context.cc b/src/ge/graph/build/run_context.cc index cece31ea..35f68186 100644 --- a/src/ge/graph/build/run_context.cc +++ b/src/ge/graph/build/run_context.cc @@ -23,7 +23,9 @@ namespace ge { RunContextUtil::~RunContextUtil() { DestroyRtModelResources(); } -Status RunContextUtil::InitMemInfo(uint8_t *data_mem_base, uint64_t data_mem_size, uint8_t *weight_mem_base, +Status RunContextUtil::InitMemInfo(uint8_t *data_mem_base, uint64_t data_mem_size, + std::map mem_type_to_data_mem_base, + std::map mem_type_to_data_mem_size, uint8_t *weight_mem_base, uint64_t weight_mem_size) { if ((data_mem_size > 0) && (data_mem_base == nullptr)) { GELOGE(PARAM_INVALID, "InitMemInfo param data_mem_base is null but data_mem_size = %lu.", data_mem_size); @@ -33,10 +35,20 @@ Status RunContextUtil::InitMemInfo(uint8_t *data_mem_base, uint64_t data_mem_siz GELOGE(PARAM_INVALID, "InitMemInfo param weight_mem_base is null but weight_mem_size = %lu.", weight_mem_size); return PARAM_INVALID; } + if (mem_type_to_data_mem_base.empty() || mem_type_to_data_mem_size.empty() || + mem_type_to_data_mem_base.size() != mem_type_to_data_mem_size.size()) { + GELOGE(PARAM_INVALID, + "InitMemInfo param mem_type_to_data_mem_base size[%zu] is not equal to the size of " + "mem_type_to_data_mem_size[%zu].", + mem_type_to_data_mem_base.size(), mem_type_to_data_mem_size.size()); + return PARAM_INVALID; + } 
data_mem_base_ = data_mem_base; data_mem_size_ = data_mem_size; weight_mem_base_ = weight_mem_base; weight_mem_size_ = weight_mem_size; + mem_type_to_data_mem_base_ = mem_type_to_data_mem_base; + mem_type_to_data_mem_size_ = mem_type_to_data_mem_size; return SUCCESS; } @@ -167,10 +179,33 @@ Status RunContextUtil::CreateRunContext(Model &model, const ComputeGraphPtr &gra GELOGI("CreateRunContext: data_mem_base_ = %p, weight_mem_base_ = %p, memory_size = %lu, weight_size = %lu", data_mem_base_, weight_mem_base_, data_mem_size_, weight_mem_size_); - run_context_ = {rt_model_, nullptr, session_id, data_mem_size_, data_mem_base_, weight_mem_size_, - weight_mem_base_, buffer, stream_list_, event_list_, label_list_}; + PrintMemInfo(); + + run_context_ = {rt_model_, + nullptr, + session_id, + data_mem_size_, + data_mem_base_, + mem_type_to_data_mem_size_, + mem_type_to_data_mem_base_, + weight_mem_size_, + weight_mem_base_, + buffer, + stream_list_, + event_list_, + label_list_}; return SUCCESS; } +void RunContextUtil::PrintMemInfo() { + for (auto iter : mem_type_to_data_mem_base_) { + GELOGI("CreateRunContext: memory type = %ld, data memory base = %p", iter.first, iter.second); + } + + for (auto iter : mem_type_to_data_mem_size_) { + GELOGI("CreateRunContext: memory type = %ld, data memory size = %lu", iter.first, iter.second); + } +} + RunContext &RunContextUtil::GetRunContext() { return run_context_; } } // namespace ge diff --git a/src/ge/graph/build/run_context.h b/src/ge/graph/build/run_context.h index 5b24f343..c4c5f655 100644 --- a/src/ge/graph/build/run_context.h +++ b/src/ge/graph/build/run_context.h @@ -33,7 +33,9 @@ class RunContextUtil { virtual ~RunContextUtil(); // Init mem info. 
- ge::Status InitMemInfo(uint8_t *data_mem_base, uint64_t data_mem_size, uint8_t *weight_mem_base, + ge::Status InitMemInfo(uint8_t *data_mem_base, uint64_t data_mem_size, + std::map mem_type_to_data_mem_base, + std::map mem_type_to_data_mem_size, uint8_t *weight_mem_base, uint64_t weight_mem_size); ge::Status CreateRunContext(Model &model_def, const ComputeGraphPtr &graph, Buffer &buffer, @@ -41,6 +43,8 @@ class RunContextUtil { RunContext &GetRunContext(); + void PrintMemInfo(); + RunContext run_context_; private: @@ -61,6 +65,8 @@ class RunContextUtil { uint64_t data_mem_size_ = 0; uint8_t *weight_mem_base_ = nullptr; uint64_t weight_mem_size_ = 0; + std::map mem_type_to_data_mem_base_; + std::map mem_type_to_data_mem_size_; }; } // namespace ge #endif // GE_GRAPH_BUILD_RUN_CONTEXT_H_ diff --git a/src/ge/graph/build/task_generator.cc b/src/ge/graph/build/task_generator.cc index 58a8bf7b..7d47b611 100644 --- a/src/ge/graph/build/task_generator.cc +++ b/src/ge/graph/build/task_generator.cc @@ -33,6 +33,7 @@ #include "init/gelib.h" #include "graph/ge_local_context.h" #include "ge/ge_api_types.h" +#include "opskernel_manager/ops_kernel_builder_manager.h" using domi::LogTimeStampDef; using domi::ModelTaskDef; @@ -305,10 +306,10 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra GELOGI("Node[name:%s, type:%s] does not need to generate task.", name.c_str(), type.c_str()); continue; } - OpsKernelInfoStorePtr kernel_info_store = ops_kernel_manager.GetOpsKernelInfoStore(op_kernel_lib_name); + auto kernel_info_store = ops_kernel_manager.GetOpsKernelInfoStore(op_kernel_lib_name); if (kernel_info_store == nullptr) { - GELOGE(INTERNAL_ERROR, "No ops kernel store found. node:%s(%s), op_kernel_lib_name=%s.", name.c_str(), - type.c_str(), op_kernel_lib_name.c_str()); + GELOGE(INTERNAL_ERROR, "No ops kernel store or ops kernel builder found. 
node:%s(%s), op_kernel_lib_name=%s.", + name.c_str(), type.c_str(), op_kernel_lib_name.c_str()); return INTERNAL_ERROR; } GE_CHK_STATUS_RET(UpdateAnchorStatus(node), "Call UpdateAnchorStatus node:%s(%s) failed", name.c_str(), @@ -327,7 +328,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra GELOGD("Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task.", op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id); GE_TIMESTAMP_RESTART(GenerateTask); - auto ret = kernel_info_store->GenerateTask(*node, run_context, task_def_list); + auto ret = OpsKernelBuilderManager::Instance().GenerateTask(*node, run_context, task_def_list); GE_TIMESTAMP_ADD(GenerateTask); if (ret != SUCCESS) { GELOGE(ret, "Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task failed.", @@ -404,7 +405,8 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info size_t task_list_size_before = task_def_list.size(); OpsKernelInfoStorePtr kernel_info_store = ops_kernel_manager.GetOpsKernelInfoStore(op_kernel_lib_name); if (kernel_info_store == nullptr) { - GELOGE(INTERNAL_ERROR, "Fusion: No ops kernel store found. fusion_node:%s(%s), op_kernel_lib_name=%s.", + GELOGE(INTERNAL_ERROR, + "Fusion: No ops kernel store or ops kernel builder found. 
fusion_node:%s(%s), op_kernel_lib_name=%s.", fusion_node_name.c_str(), fusion_node_type.c_str(), op_kernel_lib_name.c_str()); return INTERNAL_ERROR; } @@ -428,7 +430,7 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info run_context.stream = run_context.graphStreamList[stream_id]; GELOGI("Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), id:%ld, stream_id:%ld] task.", op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id); - ret = kernel_info_store->GenerateTask(*fusion_node, run_context, task_def_list); + ret = OpsKernelBuilderManager::Instance().GenerateTask(*fusion_node, run_context, task_def_list); if (ret != SUCCESS) { GELOGE(ret, "Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), " diff --git a/src/ge/graph/common/ge_call_wrapper.h b/src/ge/graph/common/ge_call_wrapper.h index 305c6c15..249f952e 100644 --- a/src/ge/graph/common/ge_call_wrapper.h +++ b/src/ge/graph/common/ge_call_wrapper.h @@ -18,33 +18,31 @@ #define GE_GE_CALL_WRAPPER_H_ #include "framework/common/debug/ge_log.h" -/*lint --emacro((773),GE_TIMESTAMP_START)*/ -/*lint -esym(773,GE_TIMESTAMP_START)*/ -#define GE_TIMESTAMP_START(stage) uint64_t startUsec_##stage = ge::GetCurrentTimestap() +#define GE_TIMESTAMP_START(stage) uint64_t startUsec_##stage = ge::GetCurrentTimestamp() #define GE_TIMESTAMP_END(stage, stage_name) \ do { \ - uint64_t endUsec_##stage = ge::GetCurrentTimestap(); \ + uint64_t endUsec_##stage = ge::GetCurrentTimestamp(); \ GELOGI("[GEPERFTRACE] The time cost of %s is [%lu] micro second.", (stage_name), \ (endUsec_##stage - startUsec_##stage)); \ } while (0); #define GE_TIMESTAMP_EVENT_END(stage, stage_name) \ do { \ - uint64_t endUsec_##stage = ge::GetCurrentTimestap(); \ + uint64_t endUsec_##stage = ge::GetCurrentTimestamp(); \ GEEVENT("[GEPERFTRACE] The time cost of %s is [%lu] micro second.", (stage_name), \ (endUsec_##stage - startUsec_##stage)); \ } while (0); -#define 
GE_TIMESTAMP_CALLNUM_START(stage) \ - uint64_t startUsec_##stage = ge::GetCurrentTimestap(); \ - uint64_t call_num_of##stage = 0; \ +#define GE_TIMESTAMP_CALLNUM_START(stage) \ + uint64_t startUsec_##stage = ge::GetCurrentTimestamp(); \ + uint64_t call_num_of##stage = 0; \ uint64_t time_of##stage = 0 -#define GE_TIMESTAMP_RESTART(stage) (startUsec_##stage = ge::GetCurrentTimestap()) +#define GE_TIMESTAMP_RESTART(stage) (startUsec_##stage = ge::GetCurrentTimestamp()) -#define GE_TIMESTAMP_ADD(stage) \ - time_of##stage += ge::GetCurrentTimestap() - startUsec_##stage; \ +#define GE_TIMESTAMP_ADD(stage) \ + time_of##stage += ge::GetCurrentTimestamp() - startUsec_##stage; \ call_num_of##stage++ #define GE_TIMESTAMP_CALLNUM_END(stage, stage_name) \ diff --git a/src/ge/graph/label/label_maker_factory.h b/src/ge/graph/label/label_maker_factory.h index 0a87ec66..6bfc1e33 100644 --- a/src/ge/graph/label/label_maker_factory.h +++ b/src/ge/graph/label/label_maker_factory.h @@ -56,13 +56,12 @@ class LabelMakerFactory { LabelMakerFactory::Instance().RegisterCreator(node_type, func); } - ~Registerar() {} + ~Registerar() = default; }; private: - LabelMakerFactory() {} - - ~LabelMakerFactory() {} + LabelMakerFactory() = default; + ~LabelMakerFactory() = default; // register creator, this function will call in the constructor void RegisterCreator(const std::string &node_type, const LabelCreatorFun func) { diff --git a/src/ge/graph/load/new_model_manager/data_dumper.cc b/src/ge/graph/load/new_model_manager/data_dumper.cc index c6283d92..10c26d87 100644 --- a/src/ge/graph/load/new_model_manager/data_dumper.cc +++ b/src/ge/graph/load/new_model_manager/data_dumper.cc @@ -83,6 +83,14 @@ static uint64_t GetNowTime() { return ret; } + +static void ReplaceStringElem(std::string &str) { + for_each(str.begin(), str.end(), [](char &ch) { + if ((ch == ' ') || (ch == '.') || (ch == '/') || (ch == '\\')) { + ch = '_'; + } + }); +} } // namespace static int32_t GetIrDataType(ge::DataType 
data_type) { @@ -196,14 +204,17 @@ void DataDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr op_desc_info.op_type = op->GetType(); op_desc_info.task_id = task_id; op_desc_info.stream_id = stream_id; - for (size_t i = 0; i < op->GetInputsSize(); ++i) { - GeTensorDesc input_desc = op->GetInputDesc(i); - op_desc_info.input_format.emplace_back(input_desc.GetFormat()); - op_desc_info.input_shape.emplace_back(input_desc.GetShape().GetDims()); - op_desc_info.input_data_type.emplace_back(input_desc.GetDataType()); + for (size_t i = 0; i < op->GetAllInputsSize(); ++i) { + GeTensorDescPtr input_tensor_desc = op->MutableInputDesc(i); + if (input_tensor_desc == nullptr) { + continue; + } + op_desc_info.input_format.emplace_back(input_tensor_desc->GetFormat()); + op_desc_info.input_shape.emplace_back(input_tensor_desc->GetShape().GetDims()); + op_desc_info.input_data_type.emplace_back(input_tensor_desc->GetDataType()); int64_t input_size = 0; - auto tensor_descs = op->GetAllInputsDesc(); - if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(i), input_size) != SUCCESS) { + + if (TensorUtils::GetTensorSizeInBytes(*input_tensor_desc, input_size) != SUCCESS) { GELOGW("Get input size failed"); return; } @@ -211,13 +222,15 @@ void DataDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr op_desc_info.input_size.emplace_back(input_size); } for (size_t j = 0; j < op->GetOutputsSize(); ++j) { - GeTensorDesc output_desc = op->GetOutputDesc(j); - op_desc_info.output_format.emplace_back(output_desc.GetFormat()); - op_desc_info.output_shape.emplace_back(output_desc.GetShape().GetDims()); - op_desc_info.output_data_type.emplace_back(output_desc.GetDataType()); + GeTensorDescPtr output_tensor_desc = op->MutableOutputDesc(j); + if (output_tensor_desc == nullptr) { + continue; + } + op_desc_info.output_format.emplace_back(output_tensor_desc->GetFormat()); + op_desc_info.output_shape.emplace_back(output_tensor_desc->GetShape().GetDims()); + 
op_desc_info.output_data_type.emplace_back(output_tensor_desc->GetDataType()); int64_t output_size = 0; - auto tensor_descs = op->GetAllOutputsDesc(); - if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(j), output_size) != SUCCESS) { + if (TensorUtils::GetTensorSizeInBytes(*output_tensor_desc, output_size) != SUCCESS) { GELOGW("Get input size failed"); return; } @@ -671,12 +684,32 @@ Status DataDumper::LoadDumpInfo() { op_mapping_info.set_flag(kAicpuLoadFlag); op_mapping_info.set_dump_step(dump_properties_.GetDumpStep()); SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info); - GELOGI("Dump step is %s and dump path is %s in load dump info", dump_properties_.GetDumpStep().c_str(), - dump_path.c_str()); + GELOGI("Dump step is %s and dump path is %s dump model is %s in load dump info", + dump_properties_.GetDumpStep().c_str(), dump_path.c_str(), dump_list_key.c_str()); + auto ret = BuildTaskInfo(op_mapping_info); + if (ret != SUCCESS) { + GELOGE(ret, "Build task info failed"); + return ret; + } + SetEndGraphIdToAicpu(end_graph_task_id_, end_graph_stream_id_, op_mapping_info); + + SetOpDebugIdToAicpu(op_debug_task_id_, op_debug_stream_id_, op_debug_addr_, op_mapping_info); + + if (!op_list_.empty() || is_op_debug_ || is_end_graph_) { + auto ret = ExecuteLoadDumpInfo(op_mapping_info); + if (ret != SUCCESS) { + GELOGE(ret, "Execute load dump info failed"); + return ret; + } + } + return SUCCESS; +} + +Status DataDumper::BuildTaskInfo(aicpu::dump::OpMappingInfo &op_mapping_info) { for (const auto &op_iter : op_list_) { auto op_desc = op_iter.op; - GELOGD("Op %s in model %s begin to add task in op_mapping_info", op_desc->GetName().c_str(), dump_list_key.c_str()); + GELOGD("Op %s in model begin to add task in op_mapping_info", op_desc->GetName().c_str()); aicpu::dump::Task task; task.set_end_graph(false); task.set_task_id(op_iter.task_id); @@ -695,12 +728,16 @@ Status DataDumper::LoadDumpInfo() { } if (dump_properties_.GetDumpMode() == 
kDumpInput) { if (op_iter.is_task) { - GE_CHK_STATUS_RET(DumpInput(op_iter, task), "Dump input failed"); + Status ret = DumpInput(op_iter, task); + if (ret != SUCCESS) { + GELOGE(ret, "Dump input failed"); + return ret; + } } op_mapping_info.mutable_task()->Add(std::move(task)); continue; } - if (dump_properties_.GetDumpMode() == kDumpAll) { + if (dump_properties_.GetDumpMode() == kDumpAll || is_op_debug_) { auto ret = DumpOutput(op_iter, task); if (ret != SUCCESS) { GELOGE(ret, "Dump output failed when in dumping all"); @@ -717,18 +754,6 @@ Status DataDumper::LoadDumpInfo() { continue; } } - - SetEndGraphIdToAicpu(end_graph_task_id_, end_graph_stream_id_, op_mapping_info); - - SetOpDebugIdToAicpu(op_debug_task_id_, op_debug_stream_id_, op_debug_addr_, op_mapping_info); - - if (!op_list_.empty() || is_op_debug_ || is_end_graph_) { - auto ret = ExecuteLoadDumpInfo(op_mapping_info); - if (ret != SUCCESS) { - GELOGE(ret, "Execute load dump info failed"); - return ret; - } - } return SUCCESS; } @@ -902,8 +927,14 @@ Status DataDumper::DumpExceptionInfo(const std::vector exceptio dump_data.mutable_output()->Add(std::move(output)); } uint64_t now_time = GetNowTime(); - string dump_file_path = "./" + op_desc_info.op_type + "." + op_desc_info.op_name + "." + - to_string(op_desc_info.task_id) + "." + to_string(now_time); + std::string op_name = op_desc_info.op_name; + std::string op_type = op_desc_info.op_type; + ReplaceStringElem(op_name); + ReplaceStringElem(op_type); + string dump_file_path = + "./" + op_type + "." + op_name + "." + to_string(op_desc_info.task_id) + "." 
+ to_string(now_time); + GELOGI("The exception dump file path is %s", dump_file_path.c_str()); + uint64_t proto_size = dump_data.ByteSizeLong(); unique_ptr proto_msg(new (std::nothrow) char[proto_size]); bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size); diff --git a/src/ge/graph/load/new_model_manager/data_dumper.h b/src/ge/graph/load/new_model_manager/data_dumper.h index 30218416..c1a102ad 100644 --- a/src/ge/graph/load/new_model_manager/data_dumper.h +++ b/src/ge/graph/load/new_model_manager/data_dumper.h @@ -36,10 +36,10 @@ namespace ge { class DataDumper { public: - DataDumper() + explicit DataDumper(const RuntimeParam &rsh) : model_name_(), model_id_(0), - runtime_param_(), + runtime_param_(rsh), dev_mem_load_(nullptr), dev_mem_unload_(nullptr), op_list_(), @@ -58,8 +58,6 @@ class DataDumper { void SetModelId(uint32_t model_id) { model_id_ = model_id; } - void SetMemory(const RuntimeParam &runtime_param) { runtime_param_ = runtime_param; } - void SetDeviceId(uint32_t device_id) { device_id_ = device_id; } void SetComputeGraph(const ComputeGraphPtr &compute_graph) { compute_graph_ = compute_graph; }; @@ -105,7 +103,7 @@ class DataDumper { std::string om_name_; uint32_t model_id_; - RuntimeParam runtime_param_; + const RuntimeParam &runtime_param_; void *dev_mem_load_; void *dev_mem_unload_; @@ -134,6 +132,8 @@ class DataDumper { DumpProperties dump_properties_; + // Build task info of op mapping info + Status BuildTaskInfo(aicpu::dump::OpMappingInfo &op_mapping_info); Status DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task); Status DumpRefOutput(const DataDumper::InnerDumpInfo &inner_dump_info, aicpu::dump::Output &output, size_t i, const std::string &node_name_index); diff --git a/src/ge/graph/load/new_model_manager/davinci_model.cc b/src/ge/graph/load/new_model_manager/davinci_model.cc index 50867782..e4512e4f 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.cc +++ 
b/src/ge/graph/load/new_model_manager/davinci_model.cc @@ -42,8 +42,8 @@ #include "graph/ge_context.h" #include "graph/graph.h" #include "graph/load/new_model_manager/cpu_queue_schedule.h" -#include "graph/load/new_model_manager/tbe_handle_store.h" #include "graph/load/new_model_manager/model_manager.h" +#include "graph/load/new_model_manager/tbe_handle_store.h" #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/graph_var_manager.h" #include "graph/manager/trans_var_data_utils.h" @@ -107,6 +107,7 @@ DavinciModel::DavinciModel(int32_t priority, const std::shared_ptrGetWeight(); std::size_t weights_size = weights.GetSize(); GE_CHECK_LE(weights_size, ALLOC_MEMORY_MAX_SIZE); @@ -281,6 +286,7 @@ Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_p } mem_base_ = static_cast(dev_ptr); + p2p_mem_base_ = static_cast(dev_ptr); weights_mem_base_ = static_cast(dev_ptr); is_inner_mem_base_ = false; is_inner_weight_base_ = false; @@ -293,13 +299,23 @@ Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_p } GEEVENT("[IMAS]InitModelMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, mem_base_, data_size); - weights_mem_base_ = mem_base_; is_inner_mem_base_ = true; is_inner_weight_base_ = true; } + if (p2p_data_size != 0) { + p2p_mem_base_ = MallocP2PMem(p2p_data_size); + if (p2p_mem_base_ == nullptr) { + GELOGE(GE_EXEC_ALLOC_P2P_MEM_FAILED, "Alloc p2p memory failed,size: %zu", p2p_data_size); + return GE_EXEC_ALLOC_P2P_MEM_FAILED; + } + GELOGI("InitModelMem graph_%u MallocMemory type[P] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, + p2p_mem_base_, p2p_data_size); + is_inner_p2p_mem_base_ = true; + } + if (weights_size != 0) { weights_mem_base_ = static_cast(weight_ptr); is_inner_weight_base_ = false; @@ -320,6 +336,7 @@ Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_p GE_CHK_STATUS_RET(InitVariableMem(), "Init variable memory 
failed."); runtime_param_.mem_base = mem_base_; runtime_param_.weight_base = weights_mem_base_; + runtime_param_.memory_infos[RT_MEMORY_P2P_DDR].memory_base = p2p_mem_base_; return SUCCESS; } @@ -343,6 +360,7 @@ Status DavinciModel::InitVariableMem() { void DavinciModel::InitRuntimeParams() { int64_t value = 0; bool ret; + MemInfo p2p_mem_info; ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_MEMORY_SIZE, value); runtime_param_.mem_size = ret ? (uint64_t)value : 0; ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_WEIGHT_SIZE, value); @@ -366,6 +384,9 @@ void DavinciModel::InitRuntimeParams() { ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_VAR_SIZE, value); runtime_param_.var_size = ret ? (uint64_t)value : 0; session_id_ = runtime_param_.session_id; + ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_P2P_MEMORY_SIZE, value); + p2p_mem_info.memory_size = ret ? (uint64_t)value : 0; + runtime_param_.memory_infos[RT_MEMORY_P2P_DDR] = std::move(p2p_mem_info); GELOGI( "InitRuntimeParams(), session_id:%lu, stream_num:%u, event_num:%u, label_num:%u, " @@ -518,6 +539,7 @@ void DavinciModel::OpDebugUnRegister() { debug_reg_mutex_.unlock(); rtError_t rt_ret = RT_ERROR_NONE; if (rt_model_handle_ != nullptr) { + GELOGD("start call debug_unregister."); rt_ret = rtDebugUnRegister(rt_model_handle_); if (rt_ret != RT_ERROR_NONE) { GELOGW("rtDebugUnRegister failed, ret: 0x%X", rt_ret); @@ -602,11 +624,6 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size // create model_handle to load model GE_CHK_RT_RET(rtModelCreate(&rt_model_handle_, 0)); GE_CHK_RT_RET(rtModelGetId(rt_model_handle_, &runtime_model_id_)); - // malloc 2M for dump l1fusion op - GE_CHK_RT_RET(rtMalloc(&l1_fusion_addr_, kDumpL1FusionOpMByteSize, RT_MEMORY_DDR)); - - // send l1fusion dump addr to rts - GE_CHK_RT_RET(rtDumpAddrSet(rt_model_handle_, l1_fusion_addr_, kDumpL1FusionOpMByteSize, kDumpFlagOfL1Fusion)); // inference will use default graph_id 0; runtime_param_.graph_id = 
compute_graph->GetGraphID(); @@ -656,6 +673,17 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size auto ret = DoTaskSink(); GE_TIMESTAMP_END(DoTaskSink, "GraphLoader::DoTaskSink"); + auto all_dump_model = GetDumpProperties().GetAllDumpModel(); + bool findByOmName = all_dump_model.find(om_name_) != all_dump_model.end(); + bool findByModelName = all_dump_model.find(name_) != all_dump_model.end(); + if (all_dump_model.find(ge::DUMP_ALL_MODEL) != all_dump_model.end() || findByOmName || findByModelName) { + // malloc 2M for dump l1fusion op + GE_CHK_RT_RET(rtMalloc(&l1_fusion_addr_, kDumpL1FusionOpMByteSize, RT_MEMORY_DDR)); + + // send l1fusion dump addr to rts + GE_CHK_RT_RET(rtDumpAddrSet(rt_model_handle_, l1_fusion_addr_, kDumpL1FusionOpMByteSize, kDumpFlagOfL1Fusion)); + } + /// In zero copy model, if a aicpu operator is connected to the first or last layer, before model execution, /// the aicpu opertor needs to destroy history record, and update operator memory address. /// The model with specified aicpu operators is only marked here, and destruction is in ModelManager::ExecuteModel(). 
@@ -769,7 +797,6 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { map data_by_index; auto nodes = compute_graph->GetAllNodes(); - const TBEKernelStore &tbekernel_store = ge_model_->GetTBEKernelStore(); const CustAICPUKernelStore &aicpu_kernel_store = ge_model_->GetCustAICPUKernelStore(); for (size_t i = 0; i < nodes.size(); i++) { auto node = nodes.at(i); @@ -782,7 +809,6 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { op_list_[op_desc->GetId()] = op_desc; GE_TIMESTAMP_RESTART(LoadTBEKernelBinToOpDesc); - tbekernel_store.LoadTBEKernelBinToOpDesc(op_desc); aicpu_kernel_store.LoadCustAICPUKernelBinToOpDesc(op_desc); GE_TIMESTAMP_ADD(LoadTBEKernelBinToOpDesc); @@ -898,8 +924,8 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, ma } uint32_t parent_index = 0; // Ignore subgraph Data Node. if (AttrUtils::GetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { - GELOGI("Skip subgraph Data node: %s.", op_desc->GetName().c_str()); - return SUCCESS; + GELOGI("Init zero copy by subgraph Data node: %s.", op_desc->GetName().c_str()); + return InitInputBatchLabel(node); } data_op_list_.push_back(op_desc); @@ -1010,9 +1036,9 @@ Status DavinciModel::InitNetOutput(const NodePtr &node) { ComputeGraphPtr owner_graph = node->GetOwnerComputeGraph(); GE_CHECK_NOTNULL(owner_graph); if (owner_graph->GetParentGraph() != nullptr) { - GELOGI("Skip subgraph NetOutput node: %s.", op_desc->GetName().c_str()); + GELOGI("Init zero copy by subgraph NetOutput node: %s.", op_desc->GetName().c_str()); op_list_.erase(op_desc->GetId()); - return SUCCESS; + return InitOutputBatchLabel(node); } output_op_list_.push_back(op_desc); @@ -1058,7 +1084,7 @@ Status DavinciModel::InitNetOutput(const NodePtr &node) { for (size_t i = 0; i < tensor_addrs.size(); ++i) { void *real_addr = tensor_addrs.at(i); DisableZeroCopy(real_addr); - real_virtual_addrs_.emplace_back(real_addr); + real_virtual_addrs_.insert(real_addr); } 
GELOGI("SetOutputOutsideAddr success."); } @@ -1124,6 +1150,68 @@ Status DavinciModel::InitOutputZeroCopyNodes(const NodePtr &node) { return SUCCESS; } +/// +/// @ingroup ge +/// @brief input zero copy node Initialize. +/// @param [in] NodePtr: Data Op. +/// @return Status +/// +Status DavinciModel::InitInputBatchLabel(const NodePtr &node) { + string batch_label; + if (!AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label)) { + return SUCCESS; // Not Multi-batch. + } + + const auto &out_data_anchor = node->GetOutDataAnchor(kDataIndex); + GE_CHECK_NOTNULL(out_data_anchor); + + for (const auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { + const auto &node = peer_in_data_anchor->GetOwnerNode(); + const auto &op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + + if (zero_copy_op_id_batch_label_.find(op_desc->GetId()) == zero_copy_op_id_batch_label_.end()) { + zero_copy_op_id_batch_label_[op_desc->GetId()] = batch_label; + GELOGD("Init input zero copy nodes success, op name: %s, op id: %ld, batch label: %s", op_desc->GetName().c_str(), + op_desc->GetId(), batch_label.c_str()); + } + } + + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief output zero copy node Initialize for Case. +/// @param [in] NodePtr: netoutput Op. +/// @return Status +/// +Status DavinciModel::InitOutputBatchLabel(const NodePtr &node) { + string batch_label; + if (!AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label)) { + return SUCCESS; // Not Multi-batch. 
+ } + + for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { + const auto &peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor(); + if (peer_out_data_anchor == nullptr) { + continue; + } + + const auto &peer_node = peer_out_data_anchor->GetOwnerNode(); + const auto &op_desc = peer_node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + + if (zero_copy_op_id_batch_label_.find(op_desc->GetId()) == zero_copy_op_id_batch_label_.end()) { + zero_copy_op_id_batch_label_[op_desc->GetId()] = batch_label; + GELOGD("Init Output zero copy nodes success, op name: %s, op id: %ld, batch label: %s", + op_desc->GetName().c_str(), op_desc->GetId(), batch_label.c_str()); + } + } + + return SUCCESS; +} + /// @ingroup ge /// @brief LabelSet Op Initialize. /// @param [in] op_desc: LabelSet Op descriptor. @@ -2677,12 +2765,17 @@ Status DavinciModel::UpdateKnownNodeArgs(const vector &inputs, const vec } GE_CHK_STATUS_RET(UpdateKnownZeroCopyAddr(), "DavinciModel::UpdateKnownZeroCopyAddr failed."); - uint32_t total_addr_size = total_io_addrs_.size() * sizeof(uint64_t); - GELOGI("DavinciModel::UpdateKnownNodeArgs device args %p, dst size %u, src size %u", args_, total_args_size_, - total_addr_size); + if (total_args_size_ == 0) { + GELOGW("DavinciModel::UpdateKnownNodeArgs device args %p, dst size %u, pass rtMemcpy.", args_, total_args_size_); + } else { + uint32_t total_addr_size = total_io_addrs_.size() * sizeof(uint64_t); + GELOGI("DavinciModel::UpdateKnownNodeArgs device args %p, dst size %u, src size %u", args_, total_args_size_, + total_addr_size); - Status rt_ret = rtMemcpy(args_, total_args_size_, total_io_addrs_.data(), total_addr_size, RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy error, ret: Ox%X", rt_ret); return FAILED;) + Status rt_ret = + rtMemcpy(args_, total_args_size_, total_io_addrs_.data(), total_addr_size, RT_MEMCPY_HOST_TO_DEVICE); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy error, ret: 
Ox%X", rt_ret); return FAILED;) + } GELOGI("DavinciModel::UpdateKnownNodeArgs success"); return SUCCESS; @@ -2727,6 +2820,11 @@ Status DavinciModel::MallocKnownArgs() { } } // malloc args memory + if (total_args_size_ == 0) { + GELOGW("DavinciModel::MallocKnownArgs total_args_size_ equals to zero."); + return SUCCESS; + } + rtError_t rt_ret = rtMalloc(&args_, total_args_size_, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); @@ -2775,19 +2873,15 @@ Status DavinciModel::DistributeTask() { auto op_index = std::max(model_task_def->task(task_index).kernel().context().op_index(), model_task_def->task(task_index).kernel_ex().op_index()); OpDescPtr op = GetOpByIndex(op_index); - if (op == nullptr) { - GELOGE(PARAM_INVALID, "Op index %u is null, op list size %zu.", op_index, op_list_.size()); - return PARAM_INVALID; - } + GE_CHECK_NOTNULL(op); SaveDumpOpInfo(runtime_param_, op, task->GetTaskID(), task->GetStreamId()); if (reinterpret_cast(task->GetDumpArgs()) != nullptr) { bool call_dump = GetDumpProperties().IsLayerNeedDump(name_, om_name_, op->GetName()) && task->CallSaveDumpInfo(); - if (call_dump) { + if (call_dump || is_op_debug_reg_) { SaveDumpTask(task->GetTaskID(), task->GetStreamId(), op, task->GetDumpArgs()); } } - // get op_name by task_index if (task->GetCtx() != nullptr) { auto iter = op_name_map_.find(task_index); @@ -2863,7 +2957,7 @@ void DavinciModel::SetCopyOnlyOutput() { /// @return None. 
/// void DavinciModel::DisableZeroCopy(const void *addr) { - if (find(real_virtual_addrs_.begin(), real_virtual_addrs_.end(), addr) == real_virtual_addrs_.end()) { + if (real_virtual_addrs_.find(addr) == real_virtual_addrs_.end()) { return; } @@ -3197,7 +3291,8 @@ Status DavinciModel::InitConstant(const OpDescPtr &op_desc) { /// @return Status /// Status DavinciModel::InitTbeHandle(const OpDescPtr &op_desc) { - TBEKernelPtr tbe_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); + auto kernel = ge_model_->GetTBEKernelStore().FindKernel(op_desc->GetName()); + auto tbe_kernel = (kernel != nullptr) ? kernel : op_desc->TryGetExtAttr(OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); if (tbe_kernel == nullptr) { GELOGE(INTERNAL_ERROR, "TBE: %s can't find tvm bin file!", op_desc->GetName().c_str()); return INTERNAL_ERROR; @@ -3222,6 +3317,8 @@ Status DavinciModel::InitTbeHandle(const OpDescPtr &op_desc) { binary.magic = RT_DEV_BINARY_MAGIC_ELF_AICPU; } else if (json_string == "RT_DEV_BINARY_MAGIC_ELF") { binary.magic = RT_DEV_BINARY_MAGIC_ELF; + } else if (json_string == "RT_DEV_BINARY_MAGIC_ELF_AIVEC") { + binary.magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC; } else { GELOGE(PARAM_INVALID, "TBE: Invalid parameter magic number! 
json: %s", json_string.c_str()); return PARAM_INVALID; @@ -3548,6 +3645,19 @@ uint8_t *DavinciModel::MallocFeatureMapMem(size_t data_size) { return mem_base; } +uint8_t *DavinciModel::MallocP2PMem(size_t p2p_data_size) { + uint8_t *p2p_mem_base = nullptr; + const string purpose("p2p memory, used for some op related to hcom"); + if (std::getenv(kEnvGeuseStaticMemory) != nullptr) { + string p2p_memory_key = std::to_string(0) + "_p"; + p2p_mem_base = + MemManager::Instance(RT_MEMORY_P2P_DDR)->MallocMemory(purpose, p2p_memory_key, p2p_data_size, GetDeviceId()); + } else { + p2p_mem_base = MemManager::Instance(RT_MEMORY_P2P_DDR)->MallocMemory(purpose, p2p_data_size, GetDeviceId()); + } + return p2p_mem_base; +} + uint8_t *DavinciModel::MallocWeightsMem(size_t weights_size) { uint8_t *weights_mem_base = nullptr; const string purpose("weights memory in inference network."); @@ -3577,6 +3687,22 @@ void DavinciModel::FreeFeatureMapMem() { } } +void DavinciModel::FreeP2PMem() { + if (std::getenv(kEnvGeuseStaticMemory) != nullptr) { + std::string p2p_memory_key = std::to_string(0) + "_p"; + if (MemManager::Instance(RT_MEMORY_P2P_DDR)->GetMemoryAddr(p2p_memory_key) != nullptr) { + GE_CHK_STATUS(MemManager::Instance(RT_MEMORY_P2P_DDR)->FreeMemory(p2p_memory_key, GetDeviceId()), + "failed to free p2p memory"); + } + p2p_mem_base_ = nullptr; + } else { + GE_IF_BOOL_EXEC(p2p_mem_base_ != nullptr && is_inner_mem_base_, + GE_CHK_STATUS(MemManager::Instance(RT_MEMORY_P2P_DDR)->FreeMemory(p2p_mem_base_, GetDeviceId()), + "failed to free p2p memory"); + p2p_mem_base_ = nullptr); + } +} + void DavinciModel::FreeWeightsMem() { if (std::getenv(kEnvGeuseStaticMemory) != nullptr) { string memory_key = std::to_string(0) + "_w"; @@ -3624,7 +3750,6 @@ void DavinciModel::SetDataDumperArgs(const ComputeGraphPtr &compute_graph) { GELOGI("set data dumper args, name: %s, id: %u.", name_.c_str(), model_id_); data_dumper_.SetModelName(name_); data_dumper_.SetModelId(model_id_); - 
data_dumper_.SetMemory(runtime_param_); data_dumper_.SetOmName(om_name_); data_dumper_.SetComputeGraph(compute_graph); data_dumper_.SetRefInfo(saved_task_addrs_); @@ -3695,11 +3820,14 @@ Status DavinciModel::GetComputeGraphInfo(const ComputeGraphPtr &graph, vectorGetName(); compute_graph_info.op_type = op_desc->GetType(); - for (size_t i = 0; i < op_desc->GetInputsSize(); ++i) { - GeTensorDesc input_desc = op_desc->GetInputDesc(i); - compute_graph_info.input_format.emplace_back(input_desc.GetFormat()); - compute_graph_info.input_shape.emplace_back(input_desc.GetShape().GetDims()); - compute_graph_info.input_data_type.emplace_back(input_desc.GetDataType()); + for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { + GeTensorDescPtr input_desc = op_desc->MutableInputDesc(i); + if (input_desc == nullptr) { + continue; + } + compute_graph_info.input_format.emplace_back(input_desc->GetFormat()); + compute_graph_info.input_shape.emplace_back(input_desc->GetShape().GetDims()); + compute_graph_info.input_data_type.emplace_back(input_desc->GetDataType()); } for (size_t j = 0; j < op_desc->GetOutputsSize(); ++j) { diff --git a/src/ge/graph/load/new_model_manager/davinci_model.h b/src/ge/graph/load/new_model_manager/davinci_model.h index 438fe639..6e127b3c 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.h +++ b/src/ge/graph/load/new_model_manager/davinci_model.h @@ -189,6 +189,8 @@ class DavinciModel { // get total mem size size_t TotalMemSize() const { return runtime_param_.mem_size; } + const std::map &P2PMemInfos() const { return runtime_param_.memory_infos; } + // model name string Name() const { return name_; } @@ -410,6 +412,8 @@ class DavinciModel { void DisableZeroCopy(const void *addr); + bool GetOpDugReg() const { return is_op_debug_reg_; } + /// /// @ingroup ge /// @brief Save outside address of Data or NetOutput used info for ZeroCopy. 
@@ -498,11 +502,6 @@ class DavinciModel { void SetDumpProperties(const DumpProperties &dump_properties) { data_dumper_.SetDumpProperties(dump_properties); } const DumpProperties &GetDumpProperties() const { return data_dumper_.GetDumpProperties(); } - void SetMemcpyOffsetAndAddr(map &memcpy_4g_offset_addr) { - memcpy_4g_offset_addr_.insert(memcpy_4g_offset_addr.begin(), memcpy_4g_offset_addr.end()); - } - const map &GetMemcpyOffsetAndAddr() const { return memcpy_4g_offset_addr_; } - bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { return data_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info); } @@ -514,8 +513,10 @@ class DavinciModel { uint8_t *var_mem_base_; // memory address of model uint8_t *mem_base_; + uint8_t *p2p_mem_base_; bool is_inner_mem_base_; bool is_inner_weight_base_; + bool is_inner_p2p_mem_base_; // input data manager DataInputer *data_inputer_; @@ -597,10 +598,14 @@ class DavinciModel { uint8_t *MallocWeightsMem(size_t weights_size); + uint8_t *MallocP2PMem(size_t p2p_data_size); + void FreeFeatureMapMem(); void FreeWeightsMem(); + void FreeP2PMem(); + void ReleaseTask(); void UnbindTaskSinkStream(); @@ -663,6 +668,22 @@ class DavinciModel { /// Status InitOutputZeroCopyNodes(const NodePtr &node); + /// + /// @ingroup ge + /// @brief input zero copy node Initialize for Case. + /// @param [in] NodePtr: Data Op. + /// @return Status + /// + Status InitInputBatchLabel(const NodePtr &node); + + /// + /// @ingroup ge + /// @brief output zero copy node Initialize for Case. + /// @param [in] NodePtr: netoutput Op. + /// @return Status + /// + Status InitOutputBatchLabel(const NodePtr &node); + /// /// @ingroup ge /// @brief Constant Op Init. 
@@ -845,7 +866,7 @@ class DavinciModel { std::map new_input_outside_addrs_; std::map new_output_outside_addrs_; - std::vector real_virtual_addrs_; + std::set real_virtual_addrs_; // output op: save cce op actual needed memory size vector output_memory_size_list_; @@ -970,8 +991,6 @@ class DavinciModel { void *op_debug_addr_ = nullptr; void *p2p_debug_addr_ = nullptr; bool is_new_model_desc_{false}; - - std::map memcpy_4g_offset_addr_; }; } // namespace ge #endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_ diff --git a/src/ge/graph/load/new_model_manager/model_utils.cc b/src/ge/graph/load/new_model_manager/model_utils.cc index 2bb111f3..a9877d89 100644 --- a/src/ge/graph/load/new_model_manager/model_utils.cc +++ b/src/ge/graph/load/new_model_manager/model_utils.cc @@ -46,10 +46,8 @@ namespace ge { vector ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { vector v_input_size; GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_size); - const size_t inputs_size = op_desc->GetAllInputsSize(); - const string op_type = op_desc->GetType(); - const vector v_is_input_const = op_desc->GetIsInputConst(); + const size_t inputs_size = op_desc->GetAllInputsSize(); for (size_t i = 0; i < inputs_size; ++i) { const GeTensorDescPtr tensor_desc = op_desc->MutableInputDesc(i); if (tensor_desc == nullptr) { @@ -58,23 +56,12 @@ vector ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { } int64_t tensor_size = 0; - if ((i < v_is_input_const.size()) && v_is_input_const[i] && (op_type != NETOUTPUT)) { - // TBE: add weights size to input - GE_CHK_STATUS(TensorUtils::GetSize(*tensor_desc, tensor_size)); - if (tensor_size) { - v_input_size.push_back(tensor_size); - } - GELOGI("[IMAS]GetInputSize op: %s, index: %lu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); - continue; - } - GE_IF_BOOL_EXEC( TensorUtils::GetSize(*tensor_desc, tensor_size) != GRAPH_SUCCESS, GELOGI("Get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); continue); - 
GELOGI("[IMAS]GetInputSize op: %s, index: %lu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); - + GELOGI("[IMAS]GetInputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); v_input_size.push_back(tensor_size); } @@ -109,6 +96,7 @@ vector ModelUtils::GetOutputSize(ConstOpDescPtr op_desc) { GELOGI("Get size from TensorDesc failed, op : %s, output index : %zu", op_desc->GetName().c_str(), i); continue); + GELOGI("[IMAS]GetOutputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); v_output_size.push_back(tensor_size); } @@ -314,7 +302,7 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_data_addr); uint64_t session_id = model_param.session_id; - const size_t inputs_size = op_desc->GetAllInputsSize(); + const size_t inputs_size = op_desc->GetInputsSize(); const vector v_input_offset = op_desc->GetInputOffset(); const string op_type = op_desc->GetType(); @@ -330,10 +318,8 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co } for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { const GeTensorDescPtr tensor_desc = op_desc->MutableInputDesc(static_cast(i)); - if (tensor_desc == nullptr) { - GELOGD("Op: %s, Index: %zu, has no input", op_desc->GetName().c_str(), i); - continue; - } + GE_IF_BOOL_EXEC(tensor_desc == nullptr, GELOGD("Op: %s, Index: %zu, has no input", op_desc->GetName().c_str(), i); + continue;) if ((i < v_is_input_const.size()) && v_is_input_const[i] && (op_type != NETOUTPUT)) { // TBE: add weights address to input int64_t tensor_size = 0; @@ -351,6 +337,16 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co continue; } + int64_t mem_type; + bool tensor_has_mem_type = ge::AttrUtils::GetInt(tensor_desc, ATTR_NAME_TENSOR_MEM_TYPE, mem_type); + if (tensor_has_mem_type && v_memory_type[i] != RT_MEMORY_L1) { + uint8_t *p2p_mem_addr = 
model_param.memory_infos.at(RT_MEMORY_P2P_DDR).memory_base + v_input_offset[i]; + v_input_data_addr.push_back(p2p_mem_addr); + GELOGI("[IMAS]GetInputDataAddrs graph_%u type[P] name[%s] input[%zu] memaddr[%p]", model_param.graph_id, + op_desc->GetName().c_str(), i, p2p_mem_addr); + continue; + } + GE_IF_BOOL_EXEC(non_const_index >= v_input_offset.size(), GELOGW("offsets=%zu, inputs=%zu, index=%zu.", v_input_offset.size(), inputs_size, non_const_index); break); @@ -366,11 +362,16 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co continue); // feature maps - uint8_t *mem_addr = nullptr; - // fusion - if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { + void *mem_addr = nullptr; + if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { // fusion mem_addr = reinterpret_cast(reinterpret_cast(input_offset)); v_input_data_addr.push_back(mem_addr); + } else if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_TS_4G) { + int64_t tensor_size = 0; + GE_CHK_STATUS_EXEC(TensorUtils::GetSize(*tensor_desc, tensor_size), return {}); + VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, input_offset); + mem_addr = model_param.ts_mem_mall->Acquire(input_offset, static_cast(tensor_size)); + v_input_data_addr.push_back(mem_addr); } else { VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, input_offset); mem_addr = model_param.mem_base + input_offset; @@ -414,12 +415,33 @@ vector ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C GELOGI("[IMAS]GetOutputDataAddrs graph_%u type[V] name[%s] output[%zu] memaddr[%p]", model_param.graph_id, op_desc->GetName().c_str(), i, variable_addr); continue); + const GeTensorDescPtr tensor_desc = op_desc->MutableOutputDesc(i); + if (tensor_desc == nullptr) { + GELOGW("Op: %s, Index: %zu, Tensor Desc is null", op_desc->GetName().c_str(), i); + continue; + } + int64_t mem_type; + bool tensor_has_mem_type = ge::AttrUtils::GetInt(tensor_desc, ATTR_NAME_TENSOR_MEM_TYPE, mem_type); + if 
(tensor_has_mem_type && v_memory_type[i] != RT_MEMORY_L1) { + uint8_t *p2p_mem_addr = model_param.memory_infos.at(RT_MEMORY_P2P_DDR).memory_base + v_output_offset[i]; + v_output_data_addr.push_back(p2p_mem_addr); + GELOGI("[IMAS]GetOutputDataAddrs graph_%u type[P] name[%s] output[%zu] memaddr[%p]", model_param.graph_id, + op_desc->GetName().c_str(), i, p2p_mem_addr); + continue; + } // feature maps - uint8_t *mem_addr = nullptr; - // fusion - if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { + void *mem_addr = nullptr; + if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { // fusion mem_addr = reinterpret_cast(reinterpret_cast(v_output_offset[i])); v_output_data_addr.push_back(mem_addr); + } else if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_TS_4G) { + const GeTensorDescPtr tensor_desc = op_desc->MutableOutputDesc(i); + GE_CHECK_NOTNULL_EXEC(tensor_desc, return {}); + int64_t tensor_size = 0; + GE_CHK_STATUS_EXEC(TensorUtils::GetSize(*tensor_desc, tensor_size), return {}); + VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, v_output_offset[i]); + mem_addr = model_param.ts_mem_mall->Acquire(v_output_offset[i], static_cast(tensor_size)); + v_output_data_addr.push_back(mem_addr); } else { VALIDATE_MEM_RANGE(op_desc, model_param.mem_size, v_output_offset[i]); mem_addr = static_cast(model_param.mem_base + v_output_offset[i]); @@ -447,9 +469,36 @@ vector ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param v_workspace_bytes.size()); return v_workspace_data_addr; } + + vector workspace_reuse_flag; + bool has_workspace_reuse = ge::AttrUtils::GetListBool(op_desc, "workspace_reuse_flag", workspace_reuse_flag); vector v_memory_type; + vector workspace_memory_type; bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_desc, TVM_ATTR_NAME_WORKSPACE_TYPE, v_memory_type); + bool has_mem_type_workspace = + ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_WORKSPACE_TYPE_LIST, workspace_memory_type); for (size_t i = 0; i < 
v_workspace_bytes.size(); ++i) { + // Temporary solution, the aicpu workspace of multiple images cannot be shared. + if (has_workspace_reuse && i < workspace_reuse_flag.size() && !workspace_reuse_flag[i]) { + void *mem_addr = model_param.aicpu_mem_mall->Acquire(v_workspace_offset[i], v_workspace_bytes[i]); + v_workspace_data_addr.push_back(mem_addr); + GELOGI( + "[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] aicpu workspace[%zu] offset[%ld] bytes[%ld] " + "memaddr[%p]", + model_param.graph_id, op_desc->GetName().c_str(), i, v_workspace_offset[i], v_workspace_bytes[i], mem_addr); + } else if (has_mem_type_workspace && workspace_memory_type[i] == RT_MEMORY_P2P_DDR) { + int64_t p2p_workspace_offset = v_workspace_offset[i]; + int64_t p2p_workspace_bytes = v_workspace_bytes[i]; + uint8_t *p2p_mem_addr = p2p_workspace_bytes == 0 + ? nullptr + : model_param.memory_infos.at(RT_MEMORY_P2P_DDR).memory_base + p2p_workspace_offset; + v_workspace_data_addr.push_back(p2p_mem_addr); + GELOGI( + "[IMAS]GetWorkspaceDataAddrs graph_%u type[P] name[%s] p2p workspace[%zu] offset[%ld] bytes[%ld] " + "memaddr[%p]", + model_param.graph_id, op_desc->GetName().c_str(), i, p2p_workspace_offset, p2p_workspace_bytes, p2p_mem_addr); + continue; + } if (has_mem_type_attr && v_memory_type[i] == RT_MEMORY_L1) { v_workspace_data_addr.push_back(reinterpret_cast(reinterpret_cast(v_workspace_offset[i]))); GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[L1] name[%s], mem_addr[workspace index %zu]:0x%lx", diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc index 4f72ec36..b21a9dc5 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc @@ -171,6 +171,10 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin dump_flag_ = RT_KERNEL_DUMPFLAG; dump_args_ = 
input_output_addr_; } + if (davinci_model_->GetOpDugReg()) { + GELOGI("Op debug is open in kernel ex task info"); + dump_args_ = input_output_addr_; + } } uint64_t input_output_addr = static_cast(reinterpret_cast(input_output_addr_)); diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc index 3964e0d5..3476751b 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc @@ -638,6 +638,9 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne dump_args_ = static_cast(args_) + offset; } + GE_CHK_BOOL_TRUE_EXEC_INFO(davinci_model_->GetOpDugReg(), dump_args_ = static_cast(args_) + offset, + "Op debug is open in TVM task info"); + Status ge_ret = UpdateL2Data(kernel_def); // update origin l2 data if (ge_ret != SUCCESS) { @@ -936,6 +939,10 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k } dump_args_ = static_cast(args_) + sizeof(aicpu::AicpuParamHead); } + if (davinci_model_->GetOpDugReg()) { + GELOGI("Op debug is open in aicpu task info"); + dump_args_ = static_cast(args_) + sizeof(aicpu::AicpuParamHead); + } if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) { dump_flag_ |= RT_KERNEL_CUSTOM_AICPU; } diff --git a/src/ge/graph/load/new_model_manager/task_info/label_switch_by_index_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/label_switch_by_index_task_info.cc index efefd3e2..bbbf313f 100644 --- a/src/ge/graph/load/new_model_manager/task_info/label_switch_by_index_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/label_switch_by_index_task_info.cc @@ -94,8 +94,10 @@ Status LabelSwitchByIndexTaskInfo::Init(const domi::TaskDef &task_def, DavinciMo label_list_[idx] = label_list[label_id]; } + rtMemType_t memory_type = op_desc->HasAttr(ATTR_NAME_MEMORY_TYPE_RANGE) ? 
RT_MEMORY_TS_4G : RT_MEMORY_HBM; + GELOGI("memory_type: %u", memory_type); args_size_ = branch_max_ * sizeof(rtLabelDevInfo); - rtError_t rt_ret = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); + rtError_t rt_ret = rtMalloc(&args_, args_size_, memory_type); if (rt_ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); diff --git a/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc index 1f542154..a4d7fcc7 100644 --- a/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc @@ -26,10 +26,7 @@ const uint32_t kAlignBytes = 64; namespace ge { Status MemcpyAddrAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("MemcpyAddrAsyncTaskInfo Init Start"); - if (davinci_model == nullptr) { - GELOGE(PARAM_INVALID, "davinci_model is null"); - return PARAM_INVALID; - } + GE_CHECK_NOTNULL(davinci_model); Status ret = SetStream(task_def.stream_id(), davinci_model->GetStreamList()); if (ret != SUCCESS) { @@ -43,12 +40,13 @@ Status MemcpyAddrAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel return INTERNAL_ERROR; } - ret = ModelUtils::GetRtAddress(davinci_model->GetRuntimeParam(), memcpy_async.src(), src_); + const RuntimeParam &rts_param = davinci_model->GetRuntimeParam(); + ret = ModelUtils::GetRtAddress(rts_param, memcpy_async.src(), src_); if (ret != SUCCESS) { return ret; } - ret = ModelUtils::GetRtAddress(davinci_model->GetRuntimeParam(), memcpy_async.dst(), dst_); + ret = ModelUtils::GetRtAddress(rts_param, memcpy_async.dst(), dst_); if (ret != SUCCESS) { return ret; } @@ -59,10 +57,7 @@ Status MemcpyAddrAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel // malloc args memory size_t args_size = sizeof(void *) * io_addrs.size(); - rtMemType_t 
memory_type = RT_MEMORY_HBM; - if (op_desc->HasAttr(ATTR_NAME_MEMORY_TYPE_RANGE)) { - memory_type = RT_MEMORY_TS_4G; - } + rtMemType_t memory_type = op_desc->HasAttr(ATTR_NAME_MEMORY_TYPE_RANGE) ? RT_MEMORY_TS_4G : RT_MEMORY_HBM; GELOGI("memory_type: %u", memory_type); rtError_t rt_ret = rtMalloc(&args_, args_size + kAlignBytes, memory_type); if (rt_ret != RT_ERROR_NONE) { diff --git a/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc index 96247e7d..3bad3c67 100644 --- a/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc @@ -22,27 +22,25 @@ namespace ge { Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("MemcpyAsyncTaskInfo Init Start"); - if (davinci_model == nullptr) { - GELOGE(PARAM_INVALID, "davinci_model is null"); - return PARAM_INVALID; - } + GE_CHECK_NOTNULL(davinci_model); + davinci_model_ = davinci_model; - Status ret = SetStream(task_def.stream_id(), davinci_model->GetStreamList()); + Status ret = SetStream(task_def.stream_id(), davinci_model_->GetStreamList()); if (ret != SUCCESS) { return ret; } - memcpy_async = task_def.memcpy_async(); - count_ = memcpy_async.count(); - kind_ = memcpy_async.kind(); - dst_max_ = memcpy_async.dst_max(); - OpDescPtr op_desc = davinci_model->GetOpByIndex(memcpy_async.op_index()); + memcpy_async_ = task_def.memcpy_async(); + count_ = memcpy_async_.count(); + kind_ = memcpy_async_.kind(); + dst_max_ = memcpy_async_.dst_max(); + OpDescPtr op_desc = davinci_model_->GetOpByIndex(memcpy_async_.op_index()); if (op_desc == nullptr) { - GELOGE(INTERNAL_ERROR, "Task op index:%u out of range", memcpy_async.op_index()); + GELOGE(INTERNAL_ERROR, "Task op index:%u out of range", memcpy_async_.op_index()); return INTERNAL_ERROR; } - if (davinci_model->IsKnownNode()) { + if 
(davinci_model_->IsKnownNode()) { src_ = reinterpret_cast(davinci_model_->GetCurrentArgsAddr(args_offset_)); dst_ = reinterpret_cast(reinterpret_cast(src_) + sizeof(void *)); // for zero copy @@ -50,29 +48,34 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da GELOGI("MemcpyAsyncTaskInfo src_ %p, dst_ %p, args_offset %u.", src_, dst_, args_offset_); return SUCCESS; } - ret = ModelUtils::GetRtAddress(davinci_model->GetRuntimeParam(), memcpy_async.src(), src_); + + const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam(); + ret = ModelUtils::GetRtAddress(rts_param, memcpy_async_.src(), src_); if (ret != SUCCESS) { return ret; } // dst_ needs different address for different chips - if (op_desc->HasAttr(ATTR_NAME_MEMORY_TYPE_RANGE)) { - ret = AllocTsMemoryForMemcpy(op_desc, davinci_model); - if (ret != SUCCESS) { - return ret; + vector memory_type_list; + (void)AttrUtils::GetListInt(op_desc, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, memory_type_list); + if (!memory_type_list.empty() && memory_type_list[0] == RT_MEMORY_TS_4G) { // TS Feature, Just one. 
+ uint64_t mem_offset = memcpy_async_.dst() - rts_param.logic_mem_base; + dst_ = static_cast(rts_param.ts_mem_mall->Acquire(mem_offset, memcpy_async_.dst_max())); + if (dst_ == nullptr) { + return FAILED; } } else { - ret = ModelUtils::GetRtAddress(davinci_model->GetRuntimeParam(), memcpy_async.dst(), dst_); + ret = ModelUtils::GetRtAddress(rts_param, memcpy_async_.dst(), dst_); if (ret != SUCCESS) { return ret; } } GELOGI("MemcpyAsyncTaskInfo Init Success, logic[0x%lx, 0x%lx], src:%p, dst:%p, max:%lu, count:%lu", - memcpy_async.src(), memcpy_async.dst(), src_, dst_, dst_max_, count_); + memcpy_async_.src(), memcpy_async_.dst(), src_, dst_, dst_max_, count_); - davinci_model->DisableZeroCopy(src_); - davinci_model->DisableZeroCopy(dst_); + davinci_model_->DisableZeroCopy(src_); + davinci_model_->DisableZeroCopy(dst_); return SUCCESS; } @@ -102,12 +105,12 @@ Status MemcpyAsyncTaskInfo::CalculateArgs(const domi::TaskDef &task_def, Davinci Status MemcpyAsyncTaskInfo::UpdateArgs() { GELOGI("MemcpyAsyncTaskInfo::UpdateArgs in."); GE_CHECK_NOTNULL(davinci_model_); - Status ret = ModelUtils::GetRtAddress(davinci_model_->GetRuntimeParam(), memcpy_async.src(), src_); + Status ret = ModelUtils::GetRtAddress(davinci_model_->GetRuntimeParam(), memcpy_async_.src(), src_); if (ret != SUCCESS) { return ret; } - ret = ModelUtils::GetRtAddress(davinci_model_->GetRuntimeParam(), memcpy_async.dst(), dst_); + ret = ModelUtils::GetRtAddress(davinci_model_->GetRuntimeParam(), memcpy_async_.dst(), dst_); if (ret != SUCCESS) { return ret; } @@ -122,33 +125,5 @@ Status MemcpyAsyncTaskInfo::UpdateArgs() { return SUCCESS; } -Status MemcpyAsyncTaskInfo::AllocTsMemoryForMemcpy(const OpDescPtr &op_desc, DavinciModel *davinci_model) { - int64_t size = 0; - auto tensor_desc = op_desc->GetOutputDescPtr(0); - if ((tensor_desc == nullptr) || (TensorUtils::GetTensorSizeInBytes(*tensor_desc, size) != GRAPH_SUCCESS)) { - GELOGE(FAILED, "GetTensorSizeInBytes failed!"); - return FAILED; - } - - rtError_t 
rt_ret = rtMalloc(&memory_4g_, size, RT_MEMORY_TS_4G); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtMalloc failed, ret: 0x%X", rt_ret); - return FAILED; - } - - // map save the opdesc's offset and special address, for update the streamSwitchN's input address - std::map memcpy_4g_offset_addr; - vector offsets = op_desc->GetOutputOffset(); - if (offsets.empty()) { - GELOGE(FAILED, "GetOutputOffset failed!"); - return FAILED; - } - memcpy_4g_offset_addr.insert(std::pair(offsets[0], memory_4g_)); - davinci_model->SetMemcpyOffsetAndAddr(memcpy_4g_offset_addr); - - dst_ = reinterpret_cast(memory_4g_); - return SUCCESS; -} - REGISTER_TASK_INFO(RT_MODEL_TASK_MEMCPY_ASYNC, MemcpyAsyncTaskInfo); } // namespace ge diff --git a/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h b/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h index 9436529d..9fe1ce24 100644 --- a/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h +++ b/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h @@ -23,19 +23,11 @@ namespace ge { class MemcpyAsyncTaskInfo : public TaskInfo { public: - MemcpyAsyncTaskInfo() : dst_(nullptr), dst_max_(0), src_(nullptr), count_(0), kind_(0), memory_4g_(nullptr) {} + MemcpyAsyncTaskInfo() : dst_(nullptr), dst_max_(0), src_(nullptr), count_(0), kind_(RT_MEMCPY_RESERVED) {} ~MemcpyAsyncTaskInfo() override { src_ = nullptr; dst_ = nullptr; - - if (memory_4g_ != nullptr) { - rtError_t ret = rtFree(memory_4g_); - if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", ret); - } - memory_4g_ = nullptr; - } } Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; @@ -47,7 +39,6 @@ class MemcpyAsyncTaskInfo : public TaskInfo { Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; private: - Status AllocTsMemoryForMemcpy(const OpDescPtr &op_desc, DavinciModel *davinci_model); uint8_t *dst_; uint64_t 
dst_max_; uint8_t *src_; @@ -55,8 +46,7 @@ class MemcpyAsyncTaskInfo : public TaskInfo { uint32_t kind_; DavinciModel *davinci_model_ = nullptr; uint32_t args_offset_ = 0; - domi::MemcpyAsyncDef memcpy_async; - void *memory_4g_; + domi::MemcpyAsyncDef memcpy_async_; }; } // namespace ge #endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ASYNC_TASK_INFO_H_ diff --git a/src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.cc index d95aefac..dd5f8082 100644 --- a/src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.cc @@ -148,38 +148,37 @@ Status StreamSwitchNTaskInfo::CalculateArgs(const domi::TaskDef &task_def, Davin int64_t tensor_size = 0; GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); davinci_model->SetTotalFixedAddrsSize(input_tensor_name, tensor_size); - GELOGI("Calculate stream switchn task args , tensor_size %ld, args_offset %ld", tensor_size, args_offset_); + GELOGI("Calculate stream switchn task args, tensor_size %ld, args_offset %ld", tensor_size, args_offset_); return SUCCESS; } Status StreamSwitchNTaskInfo::InputPtrUpdate(const OpDescPtr &op_desc, DavinciModel *davinci_model) { - bool is_4g_mem = false; - const map memcpy_4g_offset_addr = davinci_model->GetMemcpyOffsetAndAddr(); - vector input_offset = op_desc->GetInputOffset(); - if (input_offset.empty()) { - GELOGE(FAILED, "Get StreamSwitchN's input offset failed."); - return FAILED; - } - - auto iter = memcpy_4g_offset_addr.find(input_offset[0]); - if (iter != memcpy_4g_offset_addr.end()) { - input_ptr_ = iter->second; - is_4g_mem = true; - } - - if (is_4g_mem == false) { + // dst_ needs different address for different chips + vector memory_type_list; + (void)AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_MEM_TYPE_LIST, memory_type_list); + if (!memory_type_list.empty() && 
memory_type_list[0] == RT_MEMORY_TS_4G) { // TS Feature, Just one. + const vector input_offset = op_desc->GetInputOffset(); + const vector input_legnth = ModelUtils::GetInputSize(op_desc); + if (input_offset.empty() || input_legnth.empty()) { + GELOGE(FAILED, "input offset size %zu, input legnth size: %zu", input_offset.size(), input_legnth.size()); + return FAILED; + } + const RuntimeParam &rts_param = davinci_model->GetRuntimeParam(); + input_ptr_ = rts_param.ts_mem_mall->Acquire(input_offset[0], input_legnth[0]); + } else { if (davinci_model->IsKnownNode()) { input_ptr_ = davinci_model->GetCurrentFixedAddr(args_offset_); } else { auto input_data_addr = ModelUtils::GetInputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); if (input_data_addr.empty()) { + GELOGE(FAILED, "input data addr is empty"); return FAILED; } input_ptr_ = input_data_addr[0]; } } - GELOGI("StreamSwitchN's input_ptr is %p, is_4g_mem: %d", input_ptr_, is_4g_mem); + GELOGI("StreamSwitchN's input_ptr is %p", input_ptr_); return SUCCESS; } REGISTER_TASK_INFO(RT_MODEL_TASK_STREAM_SWITCH_N, StreamSwitchNTaskInfo); diff --git a/src/ge/graph/load/new_model_manager/task_info/task_info.h b/src/ge/graph/load/new_model_manager/task_info/task_info.h index f69511e6..df35e093 100644 --- a/src/ge/graph/load/new_model_manager/task_info/task_info.h +++ b/src/ge/graph/load/new_model_manager/task_info/task_info.h @@ -22,10 +22,24 @@ #include "cce/customize.h" #include "cce/taskdown_common.hpp" #include "framework/common/ge_inner_error_codes.h" +#include "graph/load/new_model_manager/ts_mem_mall.h" #include "graph/load/new_model_manager/task_info/task_info_factory.h" #include "proto/task.pb.h" + namespace ge { +struct MemInfo { + uint64_t memory_size = 0; + uint64_t logic_memory_base = 0; + uint8_t *memory_base = nullptr; +}; + struct RuntimeParam { + RuntimeParam() { + ts_mem_mall = std::unique_ptr(new (std::nothrow) TsMemMall()); + aicpu_mem_mall = std::unique_ptr(new (std::nothrow) 
TsMemMall(RT_MEMORY_HBM)); + } + ~RuntimeParam() = default; + uint64_t mem_size = 0; uint64_t logic_mem_base = 0; uint8_t *mem_base = nullptr; @@ -35,12 +49,16 @@ struct RuntimeParam { uint64_t var_size = 0; uint64_t logic_var_base = 0; uint8_t *var_base = nullptr; + std::map memory_infos; uint32_t batch_num = 0; uint32_t stream_num = 0; uint32_t event_num = 0; uint32_t label_num = 0; uint64_t session_id = 0; uint32_t graph_id = 0; + + std::unique_ptr ts_mem_mall; + std::unique_ptr aicpu_mem_mall; }; typedef struct FusionOpInfo { diff --git a/src/ge/graph/load/new_model_manager/ts_mem_mall.h b/src/ge/graph/load/new_model_manager/ts_mem_mall.h new file mode 100644 index 00000000..ed178e0b --- /dev/null +++ b/src/ge/graph/load/new_model_manager/ts_mem_mall.h @@ -0,0 +1,104 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_GRAPH_LOAD_TS_MEM_MALL_H_ +#define GE_GRAPH_LOAD_TS_MEM_MALL_H_ + +#include +#include +#include + +#include "runtime/base.h" +#include "framework/common/debug/ge_log.h" + +namespace { +constexpr uint32_t kMaxTsMemBlock = 2 * 1024 * 1024; // Max block 2M +constexpr uint32_t kTsMemAligment = 64; // Malloc for 64 bits align +constexpr uint32_t kTsMemAlignMask = kTsMemAligment - 1; +} // namespace + +namespace ge { +class TsMemMall { + public: + TsMemMall() { mem_type_ = RT_MEMORY_TS_4G; } + TsMemMall(rtMemType_t type) { mem_type_ = type; } + ~TsMemMall() { + for (auto it : mem_store_size_) { + rtError_t ret = rtFree(it.second); + if (ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rtFree failed, ret: 0x%X", ret); + } + } + mem_store_size_.clear(); + mem_store_addr_.clear(); + } + + void *Acquire(int64_t offset, uint64_t size) { + if (size == 0) { + GELOGE(RT_FAILED, "Acquire mem block failed, size: %lu", size); + return nullptr; + } + + uint64_t bytes = (size + kTsMemAlignMask) & ~kTsMemAlignMask; + if (bytes > kMaxTsMemBlock) { + GELOGW("Acquire TS memory may not physical continuity, size: %lu", bytes); + } + + std::lock_guard lock(mem_mutex_); + const auto it = mem_store_size_.find(offset); + if (it != mem_store_size_.end()) { + GELOGI("Acquire TS memory: %p, offset: %ld, size: %lu, align: %lu", it->second, offset, size, bytes); + return it->second; + } + + void *addr = nullptr; + rtError_t rt_ret = rtMalloc(&addr, bytes, mem_type_); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); + return nullptr; + } + + GELOGI("Acquire TS memory: %p, offset: %ld, size: %lu, align: %lu", addr, offset, size, bytes); + mem_store_size_[offset] = addr; + mem_store_addr_[addr] = offset; + return addr; + } + + void Release(void *addr) { + std::lock_guard lock(mem_mutex_); + const auto it = mem_store_addr_.find(addr); + if (it == mem_store_addr_.end()) { + GELOGW("Not TS memory: %p.", addr); + return; + } + + 
GELOGI("Release TS memory: %p.", addr); + mem_store_size_.erase(it->second); + mem_store_addr_.erase(it); + rtError_t ret = rtFree(addr); + if (ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rtFree failed, ret: 0x%X", ret); + } + } + + private: + std::mutex mem_mutex_; + std::unordered_map mem_store_size_; + std::unordered_map mem_store_addr_; + rtMemType_t mem_type_; +}; +} // namespace ge +#endif // GE_GRAPH_LOAD_TS_MEM_MALL_H_ diff --git a/src/ge/graph/load/new_model_manager/zero_copy_offset.cc b/src/ge/graph/load/new_model_manager/zero_copy_offset.cc index 18b958ef..910c1a72 100644 --- a/src/ge/graph/load/new_model_manager/zero_copy_offset.cc +++ b/src/ge/graph/load/new_model_manager/zero_copy_offset.cc @@ -141,7 +141,7 @@ void ZeroCopyOffset::IsL2Fusion(const vector &fusion_basic_addrs, const } void ZeroCopyOffset::SetInputOutsideAddrs(const vector &output_offset_list, void *addr, const size_t &index, - bool fusion_flag, std::vector &real_virtual_addrs) { + bool fusion_flag, std::set &real_virtual_addrs) { GELOGI("[ZCPY] Start to SetInputOutsideAddrs for virtual_addr %p.", addr); uint32_t out_count = 0; if (!fusion_flag) { @@ -150,7 +150,7 @@ void ZeroCopyOffset::SetInputOutsideAddrs(const vector &output_offset_l std::map> addr_mapping; addr_mapping[addr] = {}; outside_addrs_.emplace_back(addr_mapping); - real_virtual_addrs.emplace_back(addr); + real_virtual_addrs.insert(addr); } else { GELOGI("[ZCPY] set l2-fusion for virtual_addr %p.", addr); int64_t output_offset = output_offset_list.at(index); @@ -162,7 +162,7 @@ void ZeroCopyOffset::SetInputOutsideAddrs(const vector &output_offset_l std::map> addr_mapping; addr_mapping[virtual_addr] = {}; outside_addrs_.emplace_back(addr_mapping); - real_virtual_addrs.emplace_back(virtual_addr); + real_virtual_addrs.insert(virtual_addr); GELOGI("[ZCPY] virtual_addr %p has been fusion to virtual_addr %p.", addr, virtual_addr); } } diff --git a/src/ge/graph/load/new_model_manager/zero_copy_offset.h 
b/src/ge/graph/load/new_model_manager/zero_copy_offset.h index eb2cdb4d..8749d937 100644 --- a/src/ge/graph/load/new_model_manager/zero_copy_offset.h +++ b/src/ge/graph/load/new_model_manager/zero_copy_offset.h @@ -45,7 +45,7 @@ class ZeroCopyOffset { Status InitInputDataInfo(const vector &output_size_list, const vector &virtual_addr_list, const OpDescPtr &op_desc, bool &fusion_flag); void SetInputOutsideAddrs(const vector &output_offset_list, void *addr, const size_t &index, - bool fusion_flag, std::vector &real_virtual_addrs); + bool fusion_flag, std::set &real_virtual_addrs); void IsL2Fusion(const vector &fusion_basic_addrs, const int64_t &tensor_addr, bool &fusion_flag); Status InitOutputDataInfo(const vector &input_size_list, const vector &virtual_addr_list, diff --git a/src/ge/graph/manager/graph_manager.cc b/src/ge/graph/manager/graph_manager.cc index 39bdee36..ee5acd1f 100644 --- a/src/ge/graph/manager/graph_manager.cc +++ b/src/ge/graph/manager/graph_manager.cc @@ -45,6 +45,7 @@ #include "graph/manager/util/rt_context_util.h" #include "graph/partition/dynamic_shape_partition.h" #include "graph/passes/enter_pass.h" +#include "graph/partition/stage_partition.h" #include "graph/passes/addn_pass.h" #include "graph/passes/bitcast_pass.h" #include "graph/passes/atomic_addr_clean_pass.h" @@ -95,6 +96,7 @@ #include "graph/passes/variable_ref_useless_control_out_delete_pass.h" #include "graph/passes/end_of_sequence_add_control_pass.h" #include "graph/passes/subexpression_migration_pass.h" +#include "graph/passes/subgraph_const_migration_pass.h" #include "graph/passes/unused_args_clean_pass.h" #include "graph/passes/global_step_insert_pass.h" #include "graph/utils/tensor_adapter.h" @@ -131,10 +133,7 @@ bool IsTailingOptimization() { } // namespace namespace ge { -GraphManager::GraphManager(OmgContext &omg_context) - : thread_run_flag_(false), graph_run_listener_(nullptr), init_flag_(false), omg_context_(omg_context) { - SetLocalOmgContext(omg_context); -} 
+GraphManager::GraphManager() : thread_run_flag_(false), graph_run_listener_(nullptr), init_flag_(false) {} Status GraphManager::Initialize(const std::map &options) { if (init_flag_) { @@ -162,14 +161,6 @@ Status GraphManager::Initialize(const std::map &options) { return ret; } - graph_builder_.SetOptions(options_); - ret = graph_optimize_.SetOptions(options_); - if (ret != SUCCESS) { - GELOGE(ret, "[Initialize] Graph optimize initialize failed."); - return ret; - } - graph_preparer_.SetOptions(options_); - ret = graph_context_->Initialize(options); if (ret != SUCCESS) { GELOGE(ret, "[Initialize] GraphContext initialize failed."); @@ -240,6 +231,13 @@ Status GraphManager::Finalize() { continue; } } + + // clear analyzer saved info(graph level) + auto compute_graph = GraphUtils::GetComputeGraph(*graph_node->GetGraph()); + GE_CHECK_NOTNULL(compute_graph); + auto session_id = compute_graph->GetSessionID(); + auto graph_id = compute_graph->GetGraphID(); + Analyzer::GetInstance()->DestroyGraphJsonObject(session_id, graph_id); } graph_map_.clear(); cache_helper_map_.clear(); @@ -258,8 +256,8 @@ Status GraphManager::Finalize() { } Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, - const std::map &options) { - if (graph_map_.find(graph_id) != graph_map_.end()) { + const std::map &options, const OmgContext &omg_context) { + if (HasGraphNode(graph_id)) { GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[GraphManager] graph exists, graph_id = %u.", graph_id); return GE_GRAPH_GRAPH_ALREADY_EXIST; } @@ -304,19 +302,34 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, graph_node->SetGraph(graph_ptr); graph_node->SetOptions(options); + AddGraphNode(graph_id, graph_node); - graph_map_.insert(std::make_pair(graph_id, graph_node)); + AddLocalOmgContext(graph_id, omg_context); + if (!options_.output_datatype.empty()) { + GetLocalOmgContext().output_type = options_.output_datatype; + } - GELOGI("[GraphManager] add graph success, graph_id = 
%u.", graph_id); + CompilerStages &stages = GetCompilerStages(graph_id); + stages.preparer.SetOptions(options_); + Status status = stages.optimizer.SetOptions(options_); + if (status != SUCCESS) { + GELOGE(status, "Graph optimizer set options failed."); + return status; + } + stages.builder.SetOptions(options_); var_acc_ctrl_.AddGraph(graph_id, compute_graph); + + GELOGI("[GraphManager] add graph success, graph_id = %u.", graph_id); return SUCCESS; } -Status GraphManager::MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::ComputeGraphPtr &original_compute_graph) { +Status GraphManager::MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::ComputeGraphPtr &original_compute_graph, + GraphId root_graph_id) { std::shared_ptr instance_ptr = ge::GELib::GetInstance(); + GraphPartitioner &partitioner = GetCompilerStages(root_graph_id).partitioner; if (instance_ptr != nullptr && instance_ptr->InitFlag()) { - Status ret = graph_partitioner_.MergeAfterSubGraphOptimization(compute_graph, original_compute_graph); + Status ret = partitioner.MergeAfterSubGraphOptimization(compute_graph, original_compute_graph); if (ret != SUCCESS) { GELOGE(ret, "merge end and placeholder after subGraph optimization failed."); return FAILED; @@ -328,7 +341,7 @@ Status GraphManager::MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::Com return ret_topo; } } else { - auto subgraph_list = graph_partitioner_.GetSubGraphMap(); + auto subgraph_list = partitioner.GetSubGraphMap(); if (subgraph_list.find(original_compute_graph) != subgraph_list.end() && !subgraph_list[original_compute_graph].empty() && subgraph_list[original_compute_graph][0] != nullptr) { compute_graph = subgraph_list[original_compute_graph][0]->GetSubGraph(); @@ -388,8 +401,8 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr if (!op_compile_strategy.empty()) { (void)AttrUtils::SetStr(subgraph->GetSubGraph(), ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy); } - std::future f = 
executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, subgraph, session_id, - GetThreadLocalContext()); + std::future f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, + compute_graph->GetGraphID(), subgraph, session_id, GetThreadLocalContext()); if (!f.valid()) { GELOGE(FAILED, "Future is invalid"); return FAILED; @@ -403,8 +416,9 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr if (!op_compile_strategy.empty()) { (void)AttrUtils::SetStr(subgraph->GetSubGraph(), ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy); } - std::future f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, subgraph, session_id, - GetThreadLocalContext()); + std::future f = + executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, compute_graph->GetGraphID(), subgraph, + session_id, GetThreadLocalContext()); if (!f.valid()) { GELOGE(FAILED, "Future is invalid"); return FAILED; @@ -486,9 +500,9 @@ Status GraphManager::ReplaceSubgraphWithOriGraph(const ComputeGraphPtr &compute_ return SUCCESS; } -Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph) { +Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph, GraphPartitioner &partitioner) { GE_CHECK_NOTNULL(compute_graph); - auto sub_graph_map = graph_partitioner_.GetSubGraphMap(); + auto sub_graph_map = partitioner.GetSubGraphMap(); std::string buffer_optimize; graphStatus graph_status = ge::GetContext().GetOption(BUFFER_OPTIMIZE, buffer_optimize); bool need_lx_fusion = (graph_status == GRAPH_SUCCESS) && (buffer_optimize != kOffOptimize); @@ -558,29 +572,29 @@ Status GraphManager::PreRunOptimizeOriginalGraph(const GraphNodePtr &graph_node, ge::ComputeGraphPtr &compute_graph, uint64_t session_id) { GE_CHECK_NOTNULL(graph_node); GE_CHECK_NOTNULL(compute_graph); - GM_RUN_AND_DUMP_PERF("OptimizeGraphPrepare", graph_optimize_.OptimizeOriginalGraphForQuantize, compute_graph); - 
GM_RUN_AND_DUMP_PERF("HandleSummaryOp", graph_optimize_.HandleSummaryOp, compute_graph); - GM_RUN_AND_DUMP_PERF("Prepare", graph_preparer_.PrepareDynShape, graph_node->GetGraph(), inputs, compute_graph, + + CompilerStages &stages = GetCompilerStages(graph_node->GetGraphId()); + GM_RUN_AND_DUMP_PERF("OptimizeGraphPrepare", stages.optimizer.OptimizeOriginalGraphForQuantize, compute_graph); + GM_RUN_AND_DUMP_PERF("HandleSummaryOp", stages.optimizer.HandleSummaryOp, compute_graph); + GM_RUN_AND_DUMP_PERF("Prepare", stages.preparer.PrepareDynShape, graph_node->GetGraph(), inputs, compute_graph, session_id); - GM_RUN_AND_DUMP_PERF("OptimizeOriginalGraph", graph_optimize_.OptimizeOriginalGraph, compute_graph); + GM_RUN_AND_DUMP_PERF("OptimizeOriginalGraph", stages.optimizer.OptimizeOriginalGraph, compute_graph); - GM_RUN_AND_DUMP_PERF("PrepareRunningFormatRefiner", graph_preparer_.PrepareRunningFormatRefiner); - GM_RUN_AND_DUMP_PERF("RefineRunningFormat", graph_optimize_.OptimizeOriginalGraphJudgeInsert, compute_graph); + GM_RUN_AND_DUMP_PERF("PrepareRunningFormatRefiner", stages.preparer.PrepareRunningFormatRefiner); + GM_RUN_AND_DUMP_PERF("RefineRunningFormat", stages.optimizer.OptimizeOriginalGraphJudgeInsert, compute_graph); GM_RUN_AND_DUMP_PERF("SubexpressionMigration", SubexpressionMigration, compute_graph); - GE_RUN(GraphManager, graph_preparer_.RecordAIPPInfo, compute_graph); + GE_RUN(GraphManager, stages.preparer.RecordAIPPInfo, compute_graph); if (IsTailingOptimization()) { - GM_RUN_AND_DUMP_PERF("OptimizeSwitchOp", graph_preparer_.SwitchOpOptimize, compute_graph); + GM_RUN_AND_DUMP_PERF("OptimizeSwitchOp", stages.preparer.SwitchOpOptimize, compute_graph); } GM_RUN_AND_DUMP_PERF("Optimize1", OptimizeStage1, compute_graph); GM_RUN_AND_DUMP_PERF("InferShape2", compute_graph->InferShapeInNeed); - const char *unknown_shape_skip = std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION"); - if (unknown_shape_skip != nullptr) { - PassManager graph_pass; - 
GE_CHK_STATUS_RET(graph_pass.AddPass("PreRun::CtrlEdgeTransferPass", new (std::nothrow) CtrlEdgeTransferPass)) - GE_CHK_STATUS_RET(graph_pass.Run(compute_graph)); - } - GE_CHK_STATUS_RET(graph_optimize_.IdentifyReference(compute_graph), "Identify reference failed."); + PassManager graph_pass; + GE_CHK_STATUS_RET(graph_pass.AddPass("PreRun::CtrlEdgeTransferPass", new (std::nothrow) CtrlEdgeTransferPass)) + GE_CHK_STATUS_RET(graph_pass.Run(compute_graph)); + + GE_CHK_STATUS_RET(stages.optimizer.IdentifyReference(compute_graph), "Identify reference failed."); GELOGI("PreRun:PreRunOptimizeOriginalGraph success."); return SUCCESS; } @@ -607,7 +621,9 @@ Status GraphManager::PreRunAfterOptimizeSubGraph(const GraphNodePtr &graph_node, GE_CHECK_NOTNULL(graph_node); GE_CHECK_NOTNULL(compute_graph); GM_RUN_AND_DUMP_PERF("Optimize2", OptimizeStage2, compute_graph); - GM_RUN_AND_DUMP_PERF("OptimizeGraphBeforeBuildForRts", graph_optimize_.OptimizeGraphBeforeBuildForRts, compute_graph); + GM_RUN_AND_DUMP_PERF("OptimizeGraphBeforeBuildForRts", + GetCompilerStages(graph_node->GetGraphId()).optimizer.OptimizeGraphBeforeBuildForRts, + compute_graph); GM_RUN_AND_DUMP_PERF("Build", Build, graph_node, compute_graph, ge_root_model, session_id); GELOGI("PreRun:PreRunAfterOptimizeSubGraph success."); return SUCCESS; @@ -711,7 +727,7 @@ Status GraphManager::SubexpressionMigration(ComputeGraphPtr &compute_graph) { GE_TIMESTAMP_START(SubexpressionMigrationPass); auto ret = pass_manager.Run(compute_graph); - GE_TIMESTAMP_END(SubexpressionMigrationPass, "GraphManager::OptimizeStage1_1"); + GE_TIMESTAMP_END(SubexpressionMigrationPass, "GraphManager::SubexpressionMigration"); if (ret != SUCCESS && ret != NOT_CHANGED) { GELOGE(ret, "Run SubexpressionMigrationPass failed, ret:%u.", ret); return ret; @@ -858,6 +874,7 @@ Status GraphManager::SaveCacheAfterBuild(uint32_t graph_id, ge::ComputeGraphPtr } if (instance_ptr->IsIncreBuild()) { + std::lock_guard lock(member_mutex_); auto iter = 
cache_helper_map_.find(graph_id); if (iter == cache_helper_map_.end()) { GELOGW("Can not find ModelCacheHelper of graph[%u]", graph_id); @@ -936,6 +953,9 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vectorSetRunFlag(true); ComputeGraphPtr compute_graph_tmp = GraphUtils::GetComputeGraph(*(graph_node->GetGraph())); @@ -950,7 +970,7 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vectorGetGraph()); + UpdateLocalOmgContext(graph_id); + + ret = GetCompilerStages(graph_id).preparer.GenerateInfershapeGraph(graph_node->GetGraph()); if (ret != SUCCESS) { GELOGE(ret, "ATC dump infershape json failed"); return ret; @@ -1031,11 +1053,14 @@ Status GraphManager::BuildGraphForUnregisteredOp(const GraphId &graph_id, const GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "[BuildGraph] graph node is NULL, graphId = %u.", graph_id); return GE_GRAPH_GRAPH_NODE_NULL; } + + UpdateLocalOmgContext(graph_id); + auto compute_graph = GraphUtils::GetComputeGraph(*graph_node->GetGraph()); GE_CHECK_NOTNULL(compute_graph); - GM_RUN_AND_DUMP_PERF("Prepare", graph_preparer_.PrepareDynShape, graph_node->GetGraph(), inputs, compute_graph, - session_id); + GM_RUN_AND_DUMP_PERF("Prepare", GetCompilerStages(graph_id).preparer.PrepareDynShape, graph_node->GetGraph(), inputs, + compute_graph, session_id); for (auto &node : compute_graph->GetAllNodes()) { OpDescPtr op_desc = node->GetOpDesc(); @@ -1093,6 +1118,9 @@ Status GraphManager::BuildGraph(const GraphId &graph_id, const std::vectorGetGraphId()); return GE_GRAPH_ALREADY_RUNNING; } + + UpdateLocalOmgContext(graph_id); + graph_node->SetAsync(async); // set graph's run flag graph_node->SetRunFlag(true); @@ -1137,6 +1165,7 @@ Status GraphManager::SaveParams(ge::GeModel &model, const std::string &type, con } void GraphManager::RemoveModelCacheHelper(const GraphId &graph_id) { + std::lock_guard lock(member_mutex_); auto iter = cache_helper_map_.find(graph_id); if (iter != cache_helper_map_.end()) { 
cache_helper_map_.erase(iter); @@ -1150,18 +1179,20 @@ bool GraphManager::CheckModelLoad(const GeRootModelPtr &ge_root_model, bool load } Status GraphManager::RemoveGraph(const GraphId &graph_id) { - auto it = graph_map_.find(graph_id); - if (it == graph_map_.end()) { + GraphNodePtr graph_node = nullptr; + Status ret = GetGraphNode(graph_id, graph_node); + if (ret != SUCCESS) { GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "[GraphManager] Id %u does not exists.", graph_id); return GE_GRAPH_GRAPH_NOT_EXIST; } - GraphNodePtr graph_node = it->second; if ((graph_node == nullptr) || (graph_node->GetRunFlag())) { GELOGE(GE_GRAPH_GRAPH_IS_RUNNING, "[GraphManager] Id %u is running, can't be deleted.", graph_id); return GE_GRAPH_GRAPH_IS_RUNNING; } - Status ret = SUCCESS; + + std::lock_guard lock(unload_model_mutex_); + Status middle_ret; rtError_t rt_ret; const std::vector &all_sub_graph = graph_node->GetAllSubGraph(); @@ -1197,7 +1228,7 @@ Status GraphManager::RemoveGraph(const GraphId &graph_id) { } } var_acc_ctrl_.RemoveGraph(graph_id); - graph_map_.erase(it); + RemoveGraphNode(graph_id); RemoveModelCacheHelper(graph_id); @@ -1223,6 +1254,9 @@ Status GraphManager::RemoveGraph(const GraphId &graph_id) { ret = FAILED; } } + + RemoveCompilerStages(graph_id); + GE_CHK_STATUS_RET(ret, "[GraphManager:] Remove graph failed, graph_id=%u.", graph_id); GELOGI("[GraphManager] remove graph success, graph_id=%u.", graph_id); return SUCCESS; @@ -1346,9 +1380,6 @@ Status GraphManager::ParseOptions(const std::map &opti // net output node dataType ParseOption(options, OUTPUT_DATATYPE, options_.output_datatype); - if (!options_.output_datatype.empty()) { - omg_context_.output_type = options_.output_datatype; - } // Set save_original_model flag (ge.save_original_model) ParseOption(options, SAVE_ORIGINAL_MODEL, options_.save_original_model); @@ -1544,7 +1575,23 @@ Status GraphManager::ParseParallelNum(const std::string ¶llel_num, const std return SUCCESS; } +void GraphManager::AddGraphNode(GraphId 
graph_id, const GraphNodePtr &graph_node) { + std::lock_guard lock(member_mutex_); + graph_map_.emplace(graph_id, graph_node); +} + +void GraphManager::RemoveGraphNode(GraphId graph_id) { + std::lock_guard lock(member_mutex_); + graph_map_.erase(graph_id); +} + +bool GraphManager::HasGraphNode(GraphId graph_id) { + std::lock_guard lock(member_mutex_); + return graph_map_.find(graph_id) != graph_map_.end(); +} + Status GraphManager::GetGraphNode(const GraphId &graph_id, GraphNodePtr &out) { + std::lock_guard lock(member_mutex_); auto iter = graph_map_.find(graph_id); if (iter == graph_map_.end()) { out = nullptr; @@ -1566,7 +1613,7 @@ Status GraphManager::SummaryHandle(const GraphId &graph_id, std::vector summary_output_index; GELOGI("[GraphManager] SummaryHandle, outputsSize=%zu.", outputs.size()); const std::map> &whole_summary_output_indexes = - graph_optimize_.GetSummaryOutputIndexes(); + GetCompilerStages(graph_id).optimizer.GetSummaryOutputIndexes(); if (whole_summary_output_indexes.find(graph_id) == whole_summary_output_indexes.end()) { GELOGE(FAILED, "No Summary graph found in map."); return FAILED; @@ -1661,6 +1708,7 @@ Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGrap Status GraphManager::RegisterCallBackFunc( const std::string &key, const std::function &)> &callback) { + std::lock_guard lock(member_mutex_); GELOGI("[GraphManager] RegisterCallBackFunc, key=%s.", key.c_str()); me_callback_map_[key] = callback; return SUCCESS; @@ -1668,6 +1716,7 @@ Status GraphManager::RegisterCallBackFunc( Status GraphManager::PushSummaryData2ME(const GraphId &graph_id, const std::map &summary_data) { + std::lock_guard lock(member_mutex_); GELOGI("[GraphManager] PushSummaryData2ME, dataSize=%zu.", summary_data.size()); auto itr = me_callback_map_.find(kSummary); if (itr == me_callback_map_.end()) { @@ -1678,6 +1727,7 @@ Status GraphManager::PushSummaryData2ME(const GraphId &graph_id, } Status GraphManager::PushSaveData2ME(const GraphId 
&graph_id, const std::map &save_data) { + std::lock_guard lock(member_mutex_); GELOGI("[GraphManager] PushSaveData2ME, dataSize=%zu.", save_data.size()); auto itr = me_callback_map_.find(kSave); if (itr == me_callback_map_.end()) { @@ -1938,7 +1988,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { return ret; } - GraphUtils::DumpGEGraphToOnnx(*compute_graph, "OptimizeStage1_1"); + GE_DUMP(compute_graph, "OptimizeStage1_1"); NamesToPass names_to_passes; TransOpNearbyAllreduceFusionPass trans_op_nearby_allreduce_fusion_pass; @@ -1988,9 +2038,11 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { it.second.second, it.second.first); } - GraphUtils::DumpGEGraphToOnnx(*compute_graph, "OptimizeStage1_2"); + GE_DUMP(compute_graph, "OptimizeStage1_2"); PassManager graph_pass; // the prune pass should between SwitchPass and SwitchToStreamSwitchPass + GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::Migration", new (std::nothrow) SubgraphConstMigrationPass)); + GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::ArgsClean", new (std::nothrow) UnusedArgsCleanPass)); GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::PrunePass", new (std::nothrow) PrunePass)) GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::NextIterationPass", new (std::nothrow) NextIterationPass)) GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::ControlTriggerPass", new (std::nothrow) ControlTriggerPass)) @@ -2121,7 +2173,7 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) { } // After while sub graph handle, mark all node rw type - auto result = graph_optimize_.HandleMemoryRWConflict(compute_graph); + auto result = GetCompilerStages(compute_graph->GetGraphID()).optimizer.HandleMemoryRWConflict(compute_graph); if (result != SUCCESS) { GELOGW( "Mark node rw type failed. It will take some effect on memory_assign_conflicts handling." 
@@ -2212,8 +2264,16 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra if (free_memory >= (memory_size + weight_size)) { return SUCCESS; } - rtError_t rt_ret; - for (auto &it : graph_map_) { + + std::lock_guard lock(unload_model_mutex_); + + std::map graph_map; + { + std::lock_guard lock(member_mutex_); + graph_map = graph_map_; + } + + for (auto &it : graph_map) { auto graph_id = it.second->GetGraphId(); auto model = it.second->GetGeRootModel(); if (model == nullptr) { @@ -2232,7 +2292,7 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra } GELOGI("CheckAndReleaseMemory try to UnloadGraph[%u], model[%u] which MaxUsedMemory[%lu].", graph_id, model_id, max_memory_size); - rt_ret = rtSetDevice(GetContext().DeviceId()); + rtError_t rt_ret = rtSetDevice(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "[GraphManager:] rtSetDevice failed, modelId=%u, graphId=%u.", model_id, graph_id); continue; @@ -2254,16 +2314,18 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra it.second->SetLoadFlag(false); GELOGI("CheckAndReleaseMemory UnloadGraph[%u], model[%u] success and set LoadFlag to false.", graph_id, model_id); } + return SUCCESS; } -Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, +Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, GraphId root_graph_id, const SubGraphInfoPtr &sub_graph_info_ptr, uint64_t session_id, const GEThreadLocalContext &ge_context) { - Status ret = SUCCESS; - GetThreadLocalContext() = ge_context; if (sub_graph_info_ptr != nullptr && graph_manager != nullptr) { - SetLocalOmgContext(graph_manager->omg_context_); + GetContext().SetSessionId(session_id); + GetThreadLocalContext() = ge_context; + graph_manager->UpdateLocalOmgContext(root_graph_id); + ComputeGraphPtr compute_graph_tmp = sub_graph_info_ptr->GetSubGraph(); const std::string &engine_name = 
sub_graph_info_ptr->GetEngineName(); GELOGI("ProcessSubGraphWithMultiThreads start, graph name is %s, engine_name is %s, thread id is %lu", @@ -2272,7 +2334,8 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager GE_DUMP(compute_graph_tmp, "OptimizeSubGraphBefore"); GE_CHECK_NOTNULL(compute_graph_tmp); compute_graph_tmp->SetSessionID(session_id); - ret = graph_manager->graph_optimize_.OptimizeSubGraph(compute_graph_tmp, engine_name); + Status ret = + graph_manager->GetCompilerStages(root_graph_id).optimizer.OptimizeSubGraph(compute_graph_tmp, engine_name); if (ret != SUCCESS) { GELOGE(ret, "SubGraph optimize Failed %s", engine_name.c_str()); return ret; @@ -2285,9 +2348,10 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager compute_graph_tmp != nullptr ? compute_graph_tmp->GetName().c_str() : "", engine_name.c_str(), pthread_self()); } else { - GELOGE(ret, "graph_manager or sub_graph_info_ptr is nullptr"); + GELOGE(FAILED, "graph_manager or sub_graph_info_ptr is nullptr"); return FAILED; } + return SUCCESS; } @@ -2310,6 +2374,7 @@ void GraphManager::AddModelCacheHelperToMap(const GraphId &graph_id, uint64_t se ComputeGraphPtr &compute_graph) { std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr != nullptr && instance_ptr->IsIncreBuild()) { + std::lock_guard lock(member_mutex_); auto iter = cache_helper_map_.find(graph_id); if (iter == cache_helper_map_.end()) { ModelCacheHelperPtr cache_helper = MakeShared(session_id, graph_id, compute_graph); @@ -2322,18 +2387,27 @@ void GraphManager::AddModelCacheHelperToMap(const GraphId &graph_id, uint64_t se } } +ModelCacheHelperPtr GraphManager::FindModelCacheHelper(GraphId graph_id) { + std::lock_guard lock(member_mutex_); + auto iter = cache_helper_map_.find(graph_id); + if (iter != cache_helper_map_.end()) { + return iter->second; + } + + return nullptr; +} + Status GraphManager::IncreBuild(const GraphNodePtr &graph_node, GeModelPtr 
&ge_model) { std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->IsIncreBuild()) { return FAILED; } const uint32_t graph_id = graph_node->GetGraphId(); - auto iter = cache_helper_map_.find(graph_id); - if (iter == cache_helper_map_.end()) { + ModelCacheHelperPtr cache_helper = FindModelCacheHelper(graph_id); + if (cache_helper == nullptr) { GELOGW("Can not find ModelCacheHelper of graph[%u]", graph_id); return FAILED; } - ModelCacheHelperPtr cache_helper = iter->second; if (cache_helper->IsModelCacheHit()) { GEEVENT("Model cache hit."); Status ret = LoadFromCache(graph_node, cache_helper, ge_model); @@ -2368,7 +2442,6 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { if (prctl(PR_SET_NAME, ("GE_PreRun")) != 0) { GELOGW("Set thread name failed."); } - SetLocalOmgContext(graph_manager->omg_context_); PreRunArgs args; while (graph_manager->thread_run_flag_) { @@ -2376,8 +2449,13 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { if (!pop_status) { continue; } - GetThreadLocalContext() = args.context; + GELOGI("A new loop start."); + + GetContext().SetSessionId(args.session_id); + GetThreadLocalContext() = args.context; + graph_manager->UpdateLocalOmgContext(args.graph_id); + std::vector ge_inputs; ConstructGeInput(ge_inputs, args); @@ -2398,6 +2476,7 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { graph_node->Unlock(); return; } + // set graph's run flag graph_node->SetRunFlag(true); @@ -2414,7 +2493,7 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { std::vector ge_models; if (graph_manager->options_.local_fmk_op_flag) { - graph_manager->graph_optimize_.TranFrameOp(compute_graph_tmp); + graph_manager->GetCompilerStages(graph_node->GetGraphId()).optimizer.TranFrameOp(compute_graph_tmp); } // it will not execute graph preprocess, optimize, parition, build if the graph has built successful. 
@@ -2457,8 +2536,8 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { ge_root_model = graph_node->GetGeRootModel(); } - graph_manager->run_args_q_.Push( - RunArgs({graph_node, args.graph_id, args.input_tensor, ge_root_model, GetThreadLocalContext(), args.callback})); + graph_manager->run_args_q_.Push(RunArgs({graph_node, args.graph_id, args.session_id, args.input_tensor, + ge_root_model, GetThreadLocalContext(), args.callback})); GELOGI("Loop end."); } } @@ -2467,7 +2546,6 @@ void GraphManager::RunThread(GraphManager *graph_manager) { if (prctl(PR_SET_NAME, ("GE_Run")) != 0) { GELOGW("Set thread name failed."); } - SetLocalOmgContext(graph_manager->omg_context_); RunArgs args; while (graph_manager->thread_run_flag_) { @@ -2475,8 +2553,13 @@ void GraphManager::RunThread(GraphManager *graph_manager) { if (!pop_status) { continue; } + GELOGI("A new loop start."); + + GetContext().SetSessionId(args.session_id); GetThreadLocalContext() = args.context; + graph_manager->UpdateLocalOmgContext(args.graph_id); + if (args.graph_node->graph_run_async_listener_ != nullptr) { args.graph_node->graph_run_async_listener_->SetCallback(args.callback); } @@ -2632,10 +2715,19 @@ void GraphManager::SetOptionsRunGraphFlag(bool run_graph_flag) { options_.run_gr Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGraphPtr &compute_graph, uint64_t session_id) { // graph partition + // Stage partition, only for root graph + GE_TIMESTAMP_START(StagePartition); + StagePartitioner stage_partitioner(compute_graph); + auto ret = stage_partitioner.Partition(); + if (ret != SUCCESS) { + GELOGE(ret, "Graph partition by stage Failed"); + return ret; + } + GE_TIMESTAMP_EVENT_END(StagePartition, "OptimizeSubgraph::StagePartition"); // all sub graph list of root graph and sub graph GE_TIMESTAMP_START(GraphPartitionDynamicShape); DynamicShapePartitioner dynamic_shape_partitioner(compute_graph); - auto ret = dynamic_shape_partitioner.Partition(); + ret = 
dynamic_shape_partitioner.Partition(); if (ret != SUCCESS) { GELOGE(ret, "Graph partition by dynamic shape Failed"); return ret; @@ -2647,14 +2739,15 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra } GE_TIMESTAMP_EVENT_END(GraphPartitionDynamicShape, "OptimizeSubgraph::GraphPartitionDynamicShape"); GE_TIMESTAMP_START(GraphPartition); - ret = graph_partitioner_.Partition(compute_graph, GraphPartitioner::kPartitioning); + GraphPartitioner &partitioner = GetCompilerStages(graph_node->GetGraphId()).partitioner; + ret = partitioner.Partition(compute_graph, GraphPartitioner::kPartitioning); if (ret != SUCCESS) { GELOGE(ret, "Graph partition Failed"); return ret; } GE_TIMESTAMP_EVENT_END(GraphPartition, "OptimizeSubgraph::Partition1"); GE_TIMESTAMP_START(SetSubgraph); - ret = SetSubgraph(session_id, compute_graph); + ret = SetSubgraph(session_id, compute_graph, partitioner); if (ret != SUCCESS) { GELOGE(ret, "Graph set subgraph Failed"); return ret; @@ -2666,7 +2759,8 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra GE_TIMESTAMP_START(ConvertGraphToFile); std::string tuning_path; (void)GetContext().GetOption(TUNING_PATH, tuning_path); - Status ret = ConvertGraphToFile(compute_graph, tuning_path, (options_.build_step == BUILD_STEP_AFTER_BUILDER)); + Status ret = + ConvertGraphToFile(compute_graph, partitioner, tuning_path, (options_.build_step == BUILD_STEP_AFTER_BUILDER)); if (ret != SUCCESS) { GELOGE(ret, "Convert graph[%s] to file failed", compute_graph->GetName().c_str()); return ret; @@ -2679,7 +2773,7 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra std::vector merged_sub_graph_list; GE_TIMESTAMP_START(MergeSubgraph); - ret = MergeSubGraph(merged_compute_graph, compute_graph); + ret = MergeSubGraph(merged_compute_graph, compute_graph, graph_node->GetGraphId()); if (ret != SUCCESS) { GELOGE(ret, "Merge SubGraph Failed"); return ret; @@ -2702,16 +2796,17 @@ Status 
GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra return SUCCESS; } -Status GraphManager::ConvertGraphToFile(ComputeGraphPtr &compute_graph, std::string path, bool exe_flag) { +Status GraphManager::ConvertGraphToFile(ComputeGraphPtr &compute_graph, GraphPartitioner &partitioner, std::string path, + bool exe_flag) { GE_CHECK_NOTNULL(compute_graph); GELOGI("compute_graph [%s] path [%s] Enter ConvertGraphToFile.", compute_graph->GetName().c_str(), path.c_str()); std::vector non_tuning_subgraphs; - auto input_node_sub_graph_map = graph_partitioner_.graph_2_input_subgraph_; + auto input_node_sub_graph_map = partitioner.graph_2_input_subgraph_; const auto &input_subgraph_info = input_node_sub_graph_map[compute_graph]; GE_CHECK_NOTNULL(input_subgraph_info); ComputeGraphPtr input_graph_tmp = input_subgraph_info->GetSubGraph(); non_tuning_subgraphs.push_back(input_graph_tmp); - auto sub_graph_map = graph_partitioner_.GetSubGraphMap(); + auto sub_graph_map = partitioner.GetSubGraphMap(); const auto &subgraph_infos = sub_graph_map[compute_graph]; std::vector tuning_subgraphs; for (const auto &sub_graph_info_ptr : subgraph_infos) { @@ -2737,7 +2832,8 @@ Status GraphManager::Build(const GraphNodePtr &graph_node, ComputeGraphPtr &comp compute_graph->SetName(graph_name); } std::vector sub_graph_list; - auto ret = graph_builder_.Build(compute_graph, sub_graph_list, ge_root_model, session_id); + auto ret = + GetCompilerStages(graph_node->GetGraphId()).builder.Build(compute_graph, sub_graph_list, ge_root_model, session_id); if (ret != SUCCESS) { GELOGE(ret, "SubGraph build Failed."); return ret; @@ -2856,4 +2952,30 @@ Status GraphManager::SaveCheckPointResult(const Graph &graph, const std::vector< } return SUCCESS; } + +void GraphManager::AddLocalOmgContext(GraphId graph_id, const OmgContext &omg_context) { + std::lock_guard lock(member_mutex_); + omg_contexts_.emplace(graph_id, omg_context); + SetLocalOmgContext(omg_contexts_[graph_id]); +} + +void 
GraphManager::UpdateLocalOmgContext(GraphId graph_id) { + std::lock_guard lock(member_mutex_); + auto iter = omg_contexts_.find(graph_id); + if (iter != omg_contexts_.end()) { + SetLocalOmgContext(iter->second); + } else { + GELOGW("OmgContext of graph %u not found.", graph_id); + } +} + +GraphManager::CompilerStages &GraphManager::GetCompilerStages(GraphId graph_id) { + std::lock_guard lock(member_mutex_); + return compiler_stages_[graph_id]; +} + +void GraphManager::RemoveCompilerStages(GraphId graph_id) { + std::lock_guard lock(member_mutex_); + compiler_stages_.erase(graph_id); +} } // namespace ge diff --git a/src/ge/graph/manager/graph_manager.h b/src/ge/graph/manager/graph_manager.h index 9096f4a8..0a492cfb 100644 --- a/src/ge/graph/manager/graph_manager.h +++ b/src/ge/graph/manager/graph_manager.h @@ -45,8 +45,7 @@ namespace ge { class GraphManager { public: - GraphManager(OmgContext &omg_context); - + GraphManager(); ~GraphManager() = default; /// @@ -71,7 +70,8 @@ class GraphManager { /// @param [out] Graph output graph /// @return Status result of function /// - Status AddGraph(const GraphId &graph_id, const Graph &graph, const std::map &options); + Status AddGraph(const GraphId &graph_id, const Graph &graph, const std::map &options, + const OmgContext &omg_context); /// /// @ingroup ge_graph @@ -168,6 +168,13 @@ class GraphManager { Status SaveCheckPointResult(const Graph &graph, const std::vector &outputs, map &var_results); private: + struct CompilerStages { + GraphPrepare preparer; + GraphOptimize optimizer; + GraphPartitioner partitioner; + GraphBuilder builder; + }; + struct PreRunArgs { GraphId graph_id; std::vector input_tensor; @@ -179,18 +186,23 @@ class GraphManager { struct RunArgs { GraphNodePtr graph_node; GraphId graph_id; + uint64_t session_id; std::vector input_tensor; GeRootModelPtr ge_root_model; GEThreadLocalContext context; RunAsyncCallback callback; }; + void AddGraphNode(GraphId graph_id, const GraphNodePtr &graph_node); + void 
RemoveGraphNode(GraphId graph_id); + bool HasGraphNode(GraphId graph_id); Status GetGraphNode(const GraphId &graph_id, GraphNodePtr &out); std::shared_ptr GetModelListener() const { return graph_run_listener_; } - static Status ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, const SubGraphInfoPtr &sub_graph_info_ptr, - uint64_t session_id, const GEThreadLocalContext &ge_context); + static Status ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, GraphId root_graph_id, + const SubGraphInfoPtr &sub_graph_info_ptr, uint64_t session_id, + const GEThreadLocalContext &ge_context); Status PreRun(const GraphNodePtr &graph_node, const std::vector &inputs, GeRootModelPtr &ge_root_model, uint64_t session_id = INVALID_SESSION_ID); @@ -247,11 +259,13 @@ class GraphManager { bool CheckTransOpForCheckpointGraph(NodePtr &node); - Status MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::ComputeGraphPtr &original_compute_graph); + Status MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::ComputeGraphPtr &original_compute_graph, + GraphId root_graph_id); - Status ConvertGraphToFile(ComputeGraphPtr &compute_graph, std::string file_path, bool exe_flag = false); + Status ConvertGraphToFile(ComputeGraphPtr &compute_graph, GraphPartitioner &partitioner, std::string file_path, + bool exe_flag = false); - Status SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph); + Status SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph, GraphPartitioner &partitioner); void SetAttrForHcomBroadCastOp(ge::ComputeGraphPtr &compute_graph); @@ -296,6 +310,7 @@ class GraphManager { void AddModelCacheHelperToMap(const GraphId &graph_id, uint64_t session_id, ComputeGraphPtr &compute_graph); Status IncreBuild(const GraphNodePtr &graph_node, GeModelPtr &ge_model); void RemoveModelCacheHelper(const GraphId &graph_id); + ModelCacheHelperPtr FindModelCacheHelper(GraphId graph_id); static void ConstructGeInput(std::vector &ge_inputs, PreRunArgs &args); static 
void PreRunThread(GraphManager *graph_manager); @@ -326,6 +341,12 @@ class GraphManager { std::unordered_map ©_graphs); Status SetRtContext(rtContext_t rt_context, rtCtxMode_t mode, uint64_t session_id, uint32_t graph_id); + void AddLocalOmgContext(GraphId graph_id, const OmgContext &omg_context); + void UpdateLocalOmgContext(GraphId graph_id); + + CompilerStages &GetCompilerStages(GraphId graph_id); + void RemoveCompilerStages(GraphId graph_id); + std::atomic_bool thread_run_flag_; BlockingQueue prerun_args_q_{}; BlockingQueue run_args_q_{}; @@ -333,7 +354,6 @@ class GraphManager { std::thread run_thread_; std::map graph_map_; - std::map cache_helper_map_; // for run graph synchronous return @@ -348,19 +368,18 @@ class GraphManager { bool init_flag_; GraphManagerOptions options_; - OmgContext &omg_context_; + GraphContextPtr graph_context_ = nullptr; + map omg_contexts_; - GraphPrepare graph_preparer_; - GraphOptimize graph_optimize_; - GraphPartitioner graph_partitioner_; - GraphBuilder graph_builder_; - GraphLoader graph_loader_; + map compiler_stages_; GraphExecutor graph_executor_; - GraphContextPtr graph_context_ = nullptr; VarAccelerateCtrl var_acc_ctrl_; std::mutex run_mutex_; + + std::mutex member_mutex_; + std::mutex unload_model_mutex_; }; } // namespace ge diff --git a/src/ge/graph/manager/host_mem_manager.cc b/src/ge/graph/manager/host_mem_manager.cc index 1d35f7af..d4aceddd 100644 --- a/src/ge/graph/manager/host_mem_manager.cc +++ b/src/ge/graph/manager/host_mem_manager.cc @@ -18,20 +18,46 @@ #include +#include "graph/ge_context.h" #include "graph/utils/tensor_utils.h" +#include "runtime/mem.h" +namespace { +const uint32_t kMallocHostMemFlag = 0; +} // namespace namespace ge { -Status HostMemoryAllocator::Allocate(std::size_t memory_size, uint8_t *memory_addr) { - GELOGI("HostMemoryAllocator::MallocMemory size= %zu.", memory_size); +Status SharedMemAllocator::Allocate(SharedMemInfo &mem_info) { + auto device_id = GetContext().DeviceId(); + 
GELOGD("SharedMemAllocator::Malloc host mem size= %zu for devid:[%u].", mem_info.mem_size, device_id); + + auto dev_id = static_cast(device_id); + GE_CHK_RT_RET(rtSetDevice(dev_id)); + // DeviceReset before memory finished! + GE_MAKE_GUARD(not_used_var, [&] { GE_CHK_RT(rtDeviceReset(dev_id)); }); + + rtMallocHostSharedMemoryIn input_para = {mem_info.shm_name.c_str(), mem_info.mem_size, kMallocHostMemFlag}; + rtMallocHostSharedMemoryOut output_para; + rtError_t rt_ret = rtMallocHostSharedMemory(&input_para, &output_para); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api(rtMallocHostSharedMemory) failed, devid:[%u].", device_id); + return GE_GRAPH_MEMORY_ALLOC_FAILED; + } + mem_info.fd = output_para.fd; + mem_info.host_address = reinterpret_cast(output_para.ptr); + mem_info.device_address = reinterpret_cast(output_para.devPtr); return SUCCESS; } -Status HostMemoryAllocator::DeAllocate(uint8_t *memory_addr) { - if (rtFreeHost(memory_addr) != RT_ERROR_NONE) { - GELOGE(GE_GRAPH_FREE_FAILED, "MemoryAllocator::Free memory failed."); - return GE_GRAPH_FREE_FAILED; +Status SharedMemAllocator::DeAllocate(SharedMemInfo &mem_info) { + GELOGD("SharedMemAllocator::DeAllocate"); + rtFreeHostSharedMemoryIn free_para = {mem_info.shm_name.c_str(), mem_info.mem_size, mem_info.fd, + mem_info.host_address, mem_info.device_address}; + + rtError_t rt_ret = rtFreeHostSharedMemory(&free_para); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api(rtFreeHostSharedMemory) failed, ret: 0x%X.", rt_ret); + return RT_FAILED; } - memory_addr = nullptr; return ge::SUCCESS; } @@ -42,9 +68,9 @@ HostMemManager &HostMemManager::Instance() { Status HostMemManager::Initialize() { std::lock_guard lock(mutex_); - allocator_ = std::unique_ptr(new (std::nothrow) HostMemoryAllocator()); + allocator_ = std::unique_ptr(new (std::nothrow) SharedMemAllocator()); if (allocator_ == nullptr) { - GELOGE(GE_GRAPH_MALLOC_FAILED, "Host mem allocator init failed!"); + 
GELOGE(GE_GRAPH_MALLOC_FAILED, "Shared memory allocator init failed!"); return GE_GRAPH_MALLOC_FAILED; } return SUCCESS; @@ -52,35 +78,43 @@ Status HostMemManager::Initialize() { void HostMemManager::Finalize() noexcept { std::lock_guard lock(mutex_); - - for (const auto &it : var_memory_base_map_) { - if (allocator_->DeAllocate(it.second.address) != SUCCESS) { - GELOGW("Host %s mem deAllocator failed!", it.first.c_str()); + for (auto &it : var_memory_base_map_) { + if (allocator_->DeAllocate(it.second) != SUCCESS) { + GELOGW("Host %s mem release failed!", it.first.c_str()); } } var_memory_base_map_.clear(); } -Status HostMemManager::MallocMemoryForHostVar(const string &op_name, uint64_t tensor_size, uint8_t *&var_addr) { +Status HostMemManager::MallocSharedMemory(SharedMemInfo &mem_info) { std::lock_guard lock(mutex_); - if (var_memory_base_map_.find(op_name) != var_memory_base_map_.end()) { - GELOGI("Host mem for variable %s has been malloced", op_name.c_str()); - return SUCCESS; + auto iter = var_memory_base_map_.find(mem_info.op_name); + if (iter != var_memory_base_map_.end()) { + GELOGE(FAILED, "Host shared memory for op %s has been malloced", mem_info.op_name.c_str()); + return FAILED; } + mem_info.shm_name = OpNameToShmName(mem_info.op_name); GE_CHECK_NOTNULL(allocator_); - GE_CHK_STATUS(allocator_->Allocate(tensor_size, var_addr)); - HostMemInfo info(var_addr, tensor_size); - var_memory_base_map_[op_name] = info; + GE_CHK_STATUS_RET(allocator_->Allocate(mem_info)); + var_memory_base_map_[mem_info.op_name] = mem_info; return SUCCESS; } Status HostMemManager::QueryVarMemInfo(const string &op_name, uint64_t &base_addr, uint64_t &data_size) { + std::lock_guard lock(mutex_); if (var_memory_base_map_.find(op_name) == var_memory_base_map_.end()) { GELOGE(INTERNAL_ERROR, "Find host base base_addr failed,node name:%s!", op_name.c_str()); return INTERNAL_ERROR; } - base_addr = reinterpret_cast(reinterpret_cast(var_memory_base_map_[op_name].address)); - data_size = 
var_memory_base_map_[op_name].data_size; + base_addr = reinterpret_cast(reinterpret_cast(var_memory_base_map_[op_name].device_address)); + data_size = var_memory_base_map_[op_name].mem_size; return SUCCESS; } + +string HostMemManager::OpNameToShmName(const string &op_name) { + string sh_name("Ascend_"); + std::hash hash_str; + sh_name.append(std::to_string(hash_str(op_name))); + return sh_name; +} } // namespace ge diff --git a/src/ge/graph/manager/host_mem_manager.h b/src/ge/graph/manager/host_mem_manager.h index 3a5a0602..66bd5826 100644 --- a/src/ge/graph/manager/host_mem_manager.h +++ b/src/ge/graph/manager/host_mem_manager.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "framework/common/ge_inner_error_codes.h" @@ -35,19 +36,23 @@ #include "runtime/mem.h" namespace ge { -class HostMemoryAllocator { - public: - ~HostMemoryAllocator() = default; - - Status Allocate(std::size_t size, uint8_t *memory_addr); - Status DeAllocate(uint8_t *memory_addr); +struct SharedMemInfo { + string op_name; + string shm_name; + uint64_t mem_size = 0; + int fd = 0; + uint8_t *device_address = nullptr; + uint8_t *host_address = nullptr; + SharedMemInfo() = default; + SharedMemInfo(string name, uint64_t size) : op_name(std::move(name)), mem_size(size) {} }; +class SharedMemAllocator { + public: + SharedMemAllocator() = default; + ~SharedMemAllocator() = default; -struct HostMemInfo { - uint8_t *address; - uint64_t data_size; - HostMemInfo() : address(nullptr), data_size(0) {} - HostMemInfo(uint8_t *addr, uint64_t size) : address(addr), data_size(size) {} + Status Allocate(SharedMemInfo &mem_info); + Status DeAllocate(SharedMemInfo &mem_info); }; class HostMemManager { @@ -60,12 +65,13 @@ class HostMemManager { static HostMemManager &Instance(); Status Initialize(); void Finalize() noexcept; - Status MallocMemoryForHostVar(const string &op_name, uint64_t tensor_size, uint8_t *&var_addr); + Status MallocSharedMemory(SharedMemInfo &mem_nfo); Status 
QueryVarMemInfo(const string &op_name, uint64_t &base_addr, uint64_t &data_size); private: - std::unordered_map var_memory_base_map_; - std::unique_ptr allocator_; + static string OpNameToShmName(const string &op_name); + std::unordered_map var_memory_base_map_; + std::unique_ptr allocator_; mutable std::recursive_mutex mutex_; }; } // namespace ge diff --git a/src/ge/graph/manager/memory_api.cc b/src/ge/graph/manager/memory_api.cc index 0a98e983..4c75276c 100644 --- a/src/ge/graph/manager/memory_api.cc +++ b/src/ge/graph/manager/memory_api.cc @@ -18,11 +18,13 @@ #include +#include "common/ge/plugin_manager.h" #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/host_mem_manager.h" #include "graph/manager/rdma_pool_allocator.h" +#include "graph/utils/type_utils.h" #include "hccl/base.h" -#include "hccl/hcom.h" +#include "hccl/hccl_types.h" namespace ge { Status InitRdmaPool(size_t size, rtMemType_t mem_type) { @@ -35,6 +37,71 @@ Status RdmaRemoteRegister(const std::vector &var_info, rtMemType_t uint64_t device_base = 0; uint64_t device_size = 0; GE_CHK_STATUS_RET(MemManager::Instance().RdmaPoolInstance(mem_type).GetBaseAddr(device_base, device_size)); + auto table_len = var_info.size() + 1; + std::unique_ptr reg_addrs(new (std::nothrow) MemRegisterAddr[table_len]); + GE_CHECK_NOTNULL(reg_addrs); + for (size_t i = 0; i < var_info.size(); ++i) { + reg_addrs[i] = {var_info[i].base_addr, var_info[i].var_size}; + } + reg_addrs[table_len - 1] = {device_base, device_size}; + + std::string file_name = "libhccl.so"; + std::string path = PluginManager::GetPath(); + path.append(file_name); + string canonical_path = RealPath(path.c_str()); + if (canonical_path.empty()) { + GELOGE(FAILED, "Failed to get realpath of %s", path.c_str()); + return FAILED; + } + GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str()); + auto handle = dlopen(canonical_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + GE_CHECK_NOTNULL(handle); + GE_MAKE_GUARD(not_used_var, 
[&] { + if (dlclose(handle) != 0) { + GELOGW("Failed to close handle %s", dlerror()); + } + }); + + auto hcom_remote_mem_register = + (HcclResult(*)(const MemRegisterAddr *, uint32_t))dlsym(handle, "hcom_remote_access_mem_register"); + if (hcom_remote_mem_register == nullptr) { + GELOGE(FAILED, "Failed to invoke hcom_remote_mem_register function."); + return FAILED; + } + + HcclResult hccl_ret = hcom_remote_mem_register(reg_addrs.get(), table_len); + if (hccl_ret != HCCL_SUCCESS) { + GELOGE(HCCL_E_INTERNAL, "Rdma mem register failed, ret: 0x%X", hccl_ret); + return HCCL_E_INTERNAL; + } + return SUCCESS; +} + +Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uint64_t &memory_size) { + GELOGD("MallocSharedMemory in"); + uint32_t type_size = 0; + bool result = TypeUtils::GetDataTypeLength(tensor_info.data_type, type_size); + if (!result) { + GELOGE(GRAPH_FAILED, "GetDataTypeLength failed, data_type=(%s).", + TypeUtils::DataTypeToSerialString(tensor_info.data_type).c_str()); + return GRAPH_FAILED; + } + memory_size = type_size; + for (auto dim : tensor_info.dims) { + if (dim <= 0) { + GELOGE(GRAPH_FAILED, "Tensor dims should be positive"); + return GRAPH_FAILED; + } + memory_size *= dim; + } + SharedMemInfo mem_info(tensor_info.var_name, memory_size); + Status ret = HostMemManager::Instance().MallocSharedMemory(mem_info); + if (ret != SUCCESS) { + GELOGE(GRAPH_FAILED, "MallocSharedMemory failed op name [%s]", tensor_info.var_name.c_str()); + return GRAPH_FAILED; + } + dev_addr = reinterpret_cast(reinterpret_cast(mem_info.device_address)); + GELOGD("MallocSharedMemory Succeeded"); return SUCCESS; } @@ -42,4 +109,4 @@ Status GetVarBaseAddrAndSize(const string &var_name, uint64_t &base_addr, uint64 GELOGD("GetVarBaseAddrAndSize in"); return HostMemManager::Instance().QueryVarMemInfo(var_name, base_addr, var_size); } -} // namespace ge \ No newline at end of file +} // namespace ge diff --git a/src/ge/graph/manager/rdma_pool_allocator.cc 
b/src/ge/graph/manager/rdma_pool_allocator.cc index ef82deff..feea06d9 100644 --- a/src/ge/graph/manager/rdma_pool_allocator.cc +++ b/src/ge/graph/manager/rdma_pool_allocator.cc @@ -140,8 +140,8 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { block->size = aligned_size; block_bin_.insert(new_block); } - return block->ptr; GELOGD("Find block size = %zu", block->size); + return block->ptr; } GELOGW("Memory block not founded."); return nullptr; diff --git a/src/ge/graph/manager/util/hcom_util.h b/src/ge/graph/manager/util/hcom_util.h index 064058f8..ad2e3406 100644 --- a/src/ge/graph/manager/util/hcom_util.h +++ b/src/ge/graph/manager/util/hcom_util.h @@ -35,17 +35,14 @@ using std::string; using std::vector; static std::map kConstOpHcclDataType = { - {ge::DT_FLOAT, HCCL_DATA_TYPE_FP32}, - {ge::DT_FLOAT16, HCCL_DATA_TYPE_FP16}, - {ge::DT_INT8, HCCL_DATA_TYPE_INT8}, - {ge::DT_INT32, HCCL_DATA_TYPE_INT32}, + {ge::DT_FLOAT, HCCL_DATA_TYPE_FP32}, {ge::DT_FLOAT16, HCCL_DATA_TYPE_FP16}, {ge::DT_INT8, HCCL_DATA_TYPE_INT8}, + {ge::DT_INT32, HCCL_DATA_TYPE_INT32}, {ge::DT_INT64, HCCL_DATA_TYPE_INT64}, {ge::DT_UINT64, HCCL_DATA_TYPE_UINT64}, }; static std::map kConstOpHcclDataTypeSize = { - {HCCL_DATA_TYPE_FP32, sizeof(float)}, - {HCCL_DATA_TYPE_FP16, sizeof(float) / 2}, - {HCCL_DATA_TYPE_INT8, sizeof(int8_t)}, - {HCCL_DATA_TYPE_INT32, sizeof(int32_t)}, + {HCCL_DATA_TYPE_FP32, sizeof(float)}, {HCCL_DATA_TYPE_FP16, sizeof(float) / 2}, + {HCCL_DATA_TYPE_INT8, sizeof(int8_t)}, {HCCL_DATA_TYPE_INT32, sizeof(int32_t)}, + {HCCL_DATA_TYPE_INT64, sizeof(int64_t)}, {HCCL_DATA_TYPE_UINT64, sizeof(uint64_t)}, }; static std::map kHorovodRedOpToHcclRedOp = { diff --git a/src/ge/graph/manager/util/variable_accelerate_ctrl.cc b/src/ge/graph/manager/util/variable_accelerate_ctrl.cc index b62be02c..726e72be 100644 --- a/src/ge/graph/manager/util/variable_accelerate_ctrl.cc +++ b/src/ge/graph/manager/util/variable_accelerate_ctrl.cc @@ -28,6 +28,7 @@ inline bool 
IsVariable(const std::string &node_type) { } // namespace bool VarAccelerateCtrl::IsVarPermitToChangeFormats(const std::string &var_name) { + std::lock_guard lock(mutex_); auto iter = var_names_to_change_times_.find(var_name); if (iter == var_names_to_change_times_.end()) { return true; @@ -36,6 +37,7 @@ bool VarAccelerateCtrl::IsVarPermitToChangeFormats(const std::string &var_name) } void VarAccelerateCtrl::SetVarChanged(const std::string &var_name) { + std::lock_guard lock(mutex_); auto times = ++var_names_to_change_times_[var_name]; for (auto &graph_id_to_var_names : graph_ids_to_var_names_) { if (graph_id_to_var_names.second.count(var_name) > 0) { @@ -51,6 +53,7 @@ void VarAccelerateCtrl::SetVarChanged(const std::string &var_name) { } void VarAccelerateCtrl::AddGraph(uint32_t graph_id, const ComputeGraphPtr &compute_graph) { + std::lock_guard lock(mutex_); if (compute_graph == nullptr) { GELOGE(PARAM_INVALID, "Failed to add graph %u, the compute graph is null", graph_id); return; @@ -67,14 +70,19 @@ void VarAccelerateCtrl::AddGraph(uint32_t graph_id, const ComputeGraphPtr &compu } void VarAccelerateCtrl::RemoveGraph(uint32_t graph_id) { + std::lock_guard lock(mutex_); GELOGD("Remove graph %u", graph_id); graph_ids_to_var_names_.erase(graph_id); graph_ids_need_rebuild_.erase(graph_id); } + bool VarAccelerateCtrl::IsGraphNeedRebuild(uint32_t graph_id) const { + std::lock_guard lock(mutex_); return graph_ids_need_rebuild_.count(graph_id) > 0; } + void VarAccelerateCtrl::SetGraphBuildEnd(uint32_t graph_id) { + std::lock_guard lock(mutex_); graph_ids_need_rebuild_.erase(graph_id); GELOGD("The graph %u has built end, remove it from the rebuild-set", graph_id); } diff --git a/src/ge/graph/manager/util/variable_accelerate_ctrl.h b/src/ge/graph/manager/util/variable_accelerate_ctrl.h index d8504c02..a7ff04c2 100644 --- a/src/ge/graph/manager/util/variable_accelerate_ctrl.h +++ b/src/ge/graph/manager/util/variable_accelerate_ctrl.h @@ -20,6 +20,7 @@ #include #include 
#include +#include #include "graph/compute_graph.h" #include "graph/node.h" @@ -59,6 +60,8 @@ class VarAccelerateCtrl { /// std::map var_names_to_change_times_; static const int kMaxVarChangeTimes_ = 1; + + mutable std::mutex mutex_; }; } // namespace ge diff --git a/src/ge/graph/optimize/graph_optimize.cc b/src/ge/graph/optimize/graph_optimize.cc index 214f68eb..ad919338 100644 --- a/src/ge/graph/optimize/graph_optimize.cc +++ b/src/ge/graph/optimize/graph_optimize.cc @@ -30,12 +30,7 @@ const char *const kAicoreEngine = "AIcoreEngine"; namespace ge { GraphOptimize::GraphOptimize() - : optimize_type_(domi::FrameworkType::TENSORFLOW), - cal_config_(""), - insert_op_config_(""), - parse_out_node_(""), - core_type_(""), - graph_context_(nullptr) {} + : optimize_type_(domi::FrameworkType::TENSORFLOW), cal_config_(""), insert_op_config_(""), core_type_("") {} void AddNodeInputProperty(ComputeGraphPtr &compute_graph) { if (compute_graph == nullptr) { @@ -107,7 +102,7 @@ Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const std for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) { Status ret = (*iter)->OptimizeFusedGraphAfterGraphSlice(*(compute_graph)); if (ret != SUCCESS) { - GELOGE(ret, "[OptimizeSubGraph][OptimizeFusedGraphStage2]: graph optimize failed, ret:%d", ret); + GELOGE(ret, "[OptimizeSubGraph][OptimizeFusedGraphAfterGraphSlice]: graph optimize failed, ret:%d", ret); return ret; } } diff --git a/src/ge/graph/optimize/graph_optimize.h b/src/ge/graph/optimize/graph_optimize.h index 3d2db782..b4a19c3f 100644 --- a/src/ge/graph/optimize/graph_optimize.h +++ b/src/ge/graph/optimize/graph_optimize.h @@ -60,7 +60,7 @@ class GraphOptimize { const std::map> &GetSummaryOutputIndexes() const { return summary_output_indexes_; - } // lint !e1073 + } // handle summary node before preRun graph Status HandleSummaryOp(ComputeGraphPtr &compute_graph); @@ -79,12 +79,8 @@ class GraphOptimize { domi::FrameworkType optimize_type_; 
std::string cal_config_; std::string insert_op_config_; - std::string parse_out_node_; std::string core_type_; - std::vector out_nodes_name_; - std::vector out_nodes_index_; bool train_graph_flag_ = false; - GraphContextPtr graph_context_; bool local_fmk_op_flag_ = false; // record the summary names for filter sumarry result. std::map> summary_output_indexes_ = {}; diff --git a/src/ge/graph/optimize/mem_rw_conflict_optimize.cc b/src/ge/graph/optimize/mem_rw_conflict_optimize.cc index 3c3419ae..d59f5928 100644 --- a/src/ge/graph/optimize/mem_rw_conflict_optimize.cc +++ b/src/ge/graph/optimize/mem_rw_conflict_optimize.cc @@ -568,7 +568,7 @@ Status SplitIdentity(const NodePtr &node) { Status InsertIdentityAsNeeded(const NodePtr &node) { auto op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); - if (node->GetOutDataNodesSize() == 0 || node->GetInDataNodes().empty()) { + if (node->GetOutDataNodesSize() == 0) { return SUCCESS; } for (const auto &out_data_anchor : node->GetAllOutDataAnchors()) { diff --git a/src/ge/graph/partition/dynamic_shape_partition.cc b/src/ge/graph/partition/dynamic_shape_partition.cc index d1b00f12..b1fe40b5 100644 --- a/src/ge/graph/partition/dynamic_shape_partition.cc +++ b/src/ge/graph/partition/dynamic_shape_partition.cc @@ -354,19 +354,34 @@ Status DynamicShapePartitioner::MergeClusters() { return SUCCESS; } +bool DynamicShapePartitioner::JudgeUnknowShapeWithAttr(const OpDescPtr &opdesc) { + bool is_forced_unknown = false; + if (AttrUtils::GetBool(opdesc, ATTR_NAME_IS_UNKNOWN_SHAPE, is_forced_unknown) && is_forced_unknown) { + GELOGD("Collect node %s as unknown as it was marked unknown forcibly.", opdesc->GetName().c_str()); + return true; + } + + bool forced_unknown = false; + if (AttrUtils::GetBool(opdesc, ATTR_NAME_FORCE_UNKNOWN_SHAPE, forced_unknown) && forced_unknown) { + GELOGD("Collect node %s as unknown as it was marked force unknown node forcibly.", opdesc->GetName().c_str()); + return true; + } + return false; +} + Status 
DynamicShapePartitioner::CollectSpreadUnknownShapeNodes(NodePtr node) { if (unknown_shape_nodes_.count(node) > 0) { return SUCCESS; } auto opdesc = node->GetOpDesc(); + REQUIRE_NOT_NULL(opdesc, "Opdesc is nullptr."); // One can set 'ATTR_NAME_IS_UNKNOWN_SHAPE=true' on node so as to forcing the node flow into the unknown subgraph, // ignore the actual shape. - bool is_forced_unknown = false; - if (AttrUtils::GetBool(opdesc, ATTR_NAME_IS_UNKNOWN_SHAPE, is_forced_unknown) && is_forced_unknown) { - GELOGD("Collect node %s as unknown as it was marked unknown forcibly.", node->GetName().c_str()); + if (JudgeUnknowShapeWithAttr(opdesc)) { unknown_shape_nodes_.insert(node); return SUCCESS; } + size_t anchor_index = 0; bool is_unknown = false; for (auto &out_tensor : opdesc->GetAllOutputsDesc()) { diff --git a/src/ge/graph/partition/dynamic_shape_partition.h b/src/ge/graph/partition/dynamic_shape_partition.h index 06a94833..f2e5ba24 100644 --- a/src/ge/graph/partition/dynamic_shape_partition.h +++ b/src/ge/graph/partition/dynamic_shape_partition.h @@ -145,6 +145,7 @@ class DynamicShapePartitioner { // Debug functions void DumpGraph(const std::string &suffix); std::string DebugString() const; + bool JudgeUnknowShapeWithAttr(const OpDescPtr &opdesc); // Util functions Status CollectSpreadUnknownShapeNodes(NodePtr node); Status IsUnknownShapeGraph(ge::ComputeGraphPtr graph, bool &is_unknow); diff --git a/src/ge/graph/partition/graph_partition.cc b/src/ge/graph/partition/graph_partition.cc index b280074e..e6c7e64f 100644 --- a/src/ge/graph/partition/graph_partition.cc +++ b/src/ge/graph/partition/graph_partition.cc @@ -15,11 +15,14 @@ */ #include "graph/partition/graph_partition.h" + #include #include #include #include #include + +#include "analyzer/analyzer.h" #include "common/ge/ge_util.h" #include "common/op/ge_op_utils.h" #include "framework/common/types.h" @@ -149,18 +152,22 @@ Status ge::GraphPartitioner::RemoveNodeAndEdgeBetweenEndPld(ge::ComputeGraphPtr Status 
ge::GraphPartitioner::MergeAfterSubGraphOptimization(ge::ComputeGraphPtr &output_merged_compute_graph, const ge::ComputeGraphPtr &original_compute_graph) { + Status real_ret = SUCCESS; auto ret = MergeSubGraph(output_merged_compute_graph, original_compute_graph); if (ret != SUCCESS) { + // even though failed, ensure all op do finish check support + real_ret = FAILED; GELOGE(ret, "Graph merging Failed"); - return ret; } + GE_CHECK_NOTNULL(original_compute_graph); // partition sub graph for (const auto &sub_graph : original_compute_graph->GetAllSubgraphs()) { ComputeGraphPtr merged_sub_graph = nullptr; ret = MergeSubGraph(merged_sub_graph, sub_graph); if (ret != SUCCESS) { + real_ret = FAILED; GELOGE(ret, "Sub graph merging Failed"); - return ret; + continue; } // add sub graph output_merged_compute_graph->SetName(original_compute_graph->GetName()); @@ -177,8 +184,7 @@ Status ge::GraphPartitioner::MergeAfterSubGraphOptimization(ge::ComputeGraphPtr return FAILED;) auto graph_info = graph_2_graph_partition_info_[original_graph]; GE_IF_BOOL_EXEC( - graph_info.corresponding_node_in_partitions_.find(parent_node) == - graph_info.corresponding_node_in_partitions_.end(), + graph_info.corresponding_node_in_partitions_.count(parent_node) == 0, GELOGE(FAILED, "Find corresponding node failed, parent node name is %s", parent_node->GetName().c_str()); return FAILED;) auto corresponding_node = graph_info.corresponding_node_in_partitions_[parent_node]; @@ -191,9 +197,13 @@ Status ge::GraphPartitioner::MergeAfterSubGraphOptimization(ge::ComputeGraphPtr ret = output_merged_compute_graph->AddSubgraph(sub_graph->GetName(), merged_sub_graph); GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, return ret;) } - graph_2_graph_partition_info_.clear(); - graph_2_subgraph_list_.clear(); - return SUCCESS; + ClearAllPartitionData(); + if (real_ret != SUCCESS) { + auto root_graph = ge::GraphUtils::FindRootGraph(original_compute_graph); + GE_CHECK_NOTNULL(root_graph); + 
(void)Analyzer::GetInstance()->SaveAnalyzerDataToFile(root_graph->GetSessionID(), root_graph->GetGraphID()); + } + return real_ret; } Status ge::GraphPartitioner::MergeSubGraph(ge::ComputeGraphPtr &output_merged_compute_graph, @@ -834,22 +844,28 @@ bool ge::GraphPartitioner::HasSecondPath(size_t src, size_t dst, size_t upper_bo } Status ge::GraphPartitioner::Partition(ge::ComputeGraphPtr compute_graph, Mode mode) { - graph_2_graph_partition_info_.clear(); - graph_2_subgraph_list_.clear(); + ClearAllPartitionData(); + auto real_ret = SUCCESS; auto ret = PartitionSubGraph(compute_graph, mode); if (ret != SUCCESS) { GELOGE(ret, "Sub graph partition Failed"); - return ret; + real_ret = ret; } + GE_CHECK_NOTNULL(compute_graph); // partition sub graph for (const auto &sub_graph : compute_graph->GetAllSubgraphs()) { ret = PartitionSubGraph(sub_graph, mode); if (ret != SUCCESS) { GELOGE(ret, "Sub graph partition Failed"); - return ret; + real_ret = ret; } } - return SUCCESS; + if (real_ret != SUCCESS) { + auto root_graph = ge::GraphUtils::FindRootGraph(compute_graph); + GE_CHECK_NOTNULL(root_graph); + (void)Analyzer::GetInstance()->SaveAnalyzerDataToFile(root_graph->GetSessionID(), root_graph->GetGraphID()); + } + return real_ret; } Status ge::GraphPartitioner::PartitionSubGraph(ge::ComputeGraphPtr compute_graph, Mode mode) { @@ -1037,4 +1053,12 @@ void ge::GraphPartitioner::AddEndPldInformationToSubGraphInfo(ge::SubGraphInfoPt } const Graph2SubGraphInfoList &ge::GraphPartitioner::GetSubGraphMap() { return graph_2_subgraph_list_; } + +void ge::GraphPartitioner::ClearAllPartitionData() { + graph_2_graph_partition_info_.clear(); + graph_2_subgraph_list_.clear(); + graph_2_input_subgraph_.clear(); + GELOGD("Clear all partition data success."); + return; +} } // namespace ge diff --git a/src/ge/graph/partition/graph_partition.h b/src/ge/graph/partition/graph_partition.h index a363bd9d..c4425355 100644 --- a/src/ge/graph/partition/graph_partition.h +++ 
b/src/ge/graph/partition/graph_partition.h @@ -131,7 +131,7 @@ class GraphPartitioner { Status UpdatePldOpDesc(const NodePtr &dst_node, int input_index, OpDescPtr &end_op_desc); // Clear partition data - void ClearAllPartitionData(Mode mode); + void ClearAllPartitionData(); void SetMergedGraphId(ComputeGraphPtr &output_merged_compute_graph); struct GraphPartitionInfo { diff --git a/src/ge/graph/partition/stage_partition.cc b/src/ge/graph/partition/stage_partition.cc new file mode 100644 index 00000000..51322de0 --- /dev/null +++ b/src/ge/graph/partition/stage_partition.cc @@ -0,0 +1,376 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "graph/partition/stage_partition.h" + +#include +#include "framework/common/debug/ge_log.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/op_desc_utils.h" +#include "common/util.h" +#include "common/types.h" + +namespace ge { +Status StagePartitioner::Partition() { + GE_CHECK_NOTNULL(root_graph_); + if (root_graph_->GetParentGraph() != nullptr) { + return SUCCESS; + } + + for (const auto &node : root_graph_->GetDirectNode()) { + auto op_desc = node->GetOpDesc(); + uint32_t level = 0; + if (!AttrUtils::GetInt(op_desc, ATTR_STAGE_LEVEL, level)) { + continue; + } + stage_nodes_[level].insert(node); + } + if (stage_nodes_.empty()) { + GELOGI("Graph %s does not set stage_level, it is not_changed.", root_graph_->GetName().c_str()); + return SUCCESS; + } + + if (SplitStageLevel() != SUCCESS) { + GELOGE(FAILED, "Split graph-stage for graph %s failed.", root_graph_->GetName().c_str()); + return FAILED; + } + + if (StagePartition() != SUCCESS) { + GELOGE(FAILED, "Stage partition for graph %s failed.", root_graph_->GetName().c_str()); + return FAILED; + } + + if (root_graph_->TopologicalSorting() != GRAPH_SUCCESS) { + GELOGE(FAILED, + "Topological sort for graph %s after stage partition failed, " + "maybe stage_level was not set correctly.", + root_graph_->GetName().c_str()); + return FAILED; + } + return SUCCESS; +} + +Status StagePartitioner::SplitStageLevel() { + std::stack nodes; + std::unordered_set visited_stage_nodes; + for (auto &stage : stage_nodes_) { + uint32_t cur_stage_level = stage.first; + const auto &cur_stage_nodes = stage.second; + for (const auto &marked_node : cur_stage_nodes) { + nodes.push(marked_node); + } + visited_stage_nodes.clear(); + while (!nodes.empty()) { + auto node = nodes.top(); + nodes.pop(); + GE_CHECK_NOTNULL(node->GetOpDesc()); + if (node->GetOpDesc()->HasAttr(ATTR_STAGE_LEVEL) && (cur_stage_nodes.count(node) == 0)) { + continue; + } + for (const auto &in_node : 
node->GetInAllNodes()) { + if (visited_stage_nodes.count(in_node) != 0) { + continue; + } + nodes.push(in_node); + } + if (!AttrUtils::SetInt(node->GetOpDesc(), ATTR_STAGE_LEVEL, cur_stage_level)) { + GELOGE(INTERNAL_ERROR, "Set attr ATTR_STAGE_LEVEL on node %s failed.", node->GetName().c_str()); + return INTERNAL_ERROR; + } + GELOGD("Mark stage_level node %s, stage_level=%u", node->GetName().c_str(), cur_stage_level); + visited_stage_nodes.emplace(node); + } + for (const auto &node : visited_stage_nodes) { + stage.second.insert(node); + } + } + + return SUCCESS; +} + +Status StagePartitioner::StagePartition() { + for (const auto &stage : stage_nodes_) { + StageInfo stage_info(stage.first); + FindStageIO(stage.second, stage_info); + + std::string subgraph_name = "Subgraph_Level_" + std::to_string(stage.first); + NodePtr graph_node = BuildSubgraphNode(subgraph_name, stage_info); + if (graph_node == nullptr) { + GELOGE(FAILED, "Build PartitionedCall node for stage %u failed.", stage.first); + return FAILED; + } + + ComputeGraphPtr subgraph = BuildStageGraph(graph_node, stage_info); + if (subgraph == nullptr) { + GELOGE(FAILED, "Build subgraph for stage %u failed.", stage.first); + return FAILED; + } + if (root_graph_->AddSubgraph(subgraph) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Add subgraph of stage %u failed.", stage.first); + return FAILED; + } + + if ((RelinkDataEdges(graph_node, stage_info) != SUCCESS) || (RelinkCtrlEdges(graph_node, stage_info) != SUCCESS)) { + GELOGE(FAILED, "Relink edges for stage %u failed.", stage.first); + return FAILED; + } + + for (const auto &stage_node : stage.second) { + if (GraphUtils::RemoveNodeWithoutRelink(root_graph_, stage_node) != GRAPH_SUCCESS) { + GELOGW("Remove node %s failed.", stage_node->GetName().c_str()); + } + } + } + + return SUCCESS; +} + +void StagePartitioner::FindStageIO(const std::unordered_set &stage_nodes, StageInfo &stage_info) { + for (const auto &node : stage_nodes) { + // stage nodes + 
stage_info.stage_nodes.emplace(node); + // in data nodes + for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { + OutDataAnchorPtr peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + if (peer_out_anchor == nullptr) { + continue; + } + if (stage_nodes.count(peer_out_anchor->GetOwnerNode()) == 0) { + stage_info.data_inputs.emplace_back(std::make_pair(peer_out_anchor, in_data_anchor)); + } else { + stage_info.inner_data_edges.emplace_back(std::make_pair(peer_out_anchor, in_data_anchor)); + } + } + // out data nodes + std::list peer_data_anchors; + for (const auto &out_data_anchor : node->GetAllOutDataAnchors()) { + peer_data_anchors.clear(); + for (const auto &peer_in_anchor : out_data_anchor->GetPeerInDataAnchors()) { + if (stage_nodes.count(peer_in_anchor->GetOwnerNode()) == 0) { + peer_data_anchors.emplace_back(peer_in_anchor); + } + } + if (!peer_data_anchors.empty()) { + stage_info.data_outputs.emplace_back(std::make_pair(out_data_anchor, peer_data_anchors)); + } + } + // in ctrl nodes + for (const auto &in_ctrl_node : node->GetInControlNodes()) { + if (stage_nodes.count(in_ctrl_node) == 0) { + stage_info.ctrl_inputs.emplace_back(in_ctrl_node->GetOutControlAnchor(), node->GetInControlAnchor()); + } else { + stage_info.inner_ctrl_edges.emplace_back( + std::make_pair(in_ctrl_node->GetOutControlAnchor(), node->GetInControlAnchor())); + } + } + // out ctrl nodes + for (const auto &out_ctrl_node : node->GetOutControlNodes()) { + if (stage_nodes.count(out_ctrl_node) == 0) { + stage_info.ctrl_outputs.emplace_back(node->GetOutControlAnchor(), out_ctrl_node->GetInControlAnchor()); + } + } + } +} + +NodePtr StagePartitioner::BuildSubgraphNode(const std::string &graph_name, const StageInfo &stage_info) { + OpDescBuilder op_desc_builder(graph_name, PARTITIONEDCALL); + size_t input_num = stage_info.data_inputs.size(); + for (size_t i = 0; i < input_num; i++) { + auto input_desc = stage_info.data_inputs[i].second->GetOwnerNode()->GetOpDesc(); + if (input_desc 
== nullptr) { + GELOGE(PARAM_INVALID, "op_desc is null, node: %s", + stage_info.data_inputs[i].second->GetOwnerNode()->GetName().c_str()); + return nullptr; + } + op_desc_builder.AddInput("args" + std::to_string(i), + input_desc->GetInputDesc(stage_info.data_inputs[i].second->GetIdx())); + } + size_t output_num = stage_info.data_outputs.size(); + for (size_t i = 0; i < output_num; i++) { + auto output_desc = stage_info.data_outputs[i].first->GetOwnerNode()->GetOpDesc(); + if (output_desc == nullptr) { + GELOGE(PARAM_INVALID, "op_desc is null, node: %s", + stage_info.data_outputs[i].first->GetOwnerNode()->GetName().c_str()); + return nullptr; + } + op_desc_builder.AddOutput("output" + std::to_string(i), + output_desc->GetOutputDesc(stage_info.data_outputs[i].first->GetIdx())); + } + + OpDescPtr op_desc = op_desc_builder.Build(); + if (op_desc == nullptr) { + GELOGE(FAILED, "Create op_desc for subgraph node failed, name:%s.", graph_name.c_str()); + return nullptr; + } + + op_desc->AddSubgraphName("f"); + op_desc->SetSubgraphInstanceName(0, graph_name); + + NodePtr subgraph_node = root_graph_->AddNode(op_desc); + if (subgraph_node == nullptr) { + GELOGE(FAILED, "Add node %s failed.", graph_name.c_str()); + return nullptr; + } + if (subgraph_node->SetOwnerComputeGraph(root_graph_) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Set owner graph for node %s failed.", subgraph_node->GetName().c_str()); + return nullptr; + } + + return subgraph_node; +} + +ComputeGraphPtr StagePartitioner::BuildStageGraph(const NodePtr &subgraph_node, const StageInfo &stage_info) { + CompleteGraphBuilder graph_builder(subgraph_node->GetName(), false); + // Add parent node + graph_builder.SetParentNode(subgraph_node); + + // Add node + for (const auto &node : stage_info.stage_nodes) { + graph_builder.AddNode(AttrUtils::CopyOpDesc(node->GetOpDesc())); + } + + // Set Input + size_t data_input_num = stage_info.data_inputs.size(); + for (size_t i = 0; i < data_input_num; i++) { + 
graph_builder.SetInput(i, {stage_info.data_inputs[i].second->GetOwnerNode()->GetName()}, + {static_cast(stage_info.data_inputs[i].second->GetIdx())}); + } + + // Add Outputs + size_t data_output_num = stage_info.data_outputs.size(); + for (uint32_t i = 0; i < data_output_num; i++) { + graph_builder.AddOutput(stage_info.data_outputs[i].first->GetOwnerNode()->GetName(), + stage_info.data_outputs[i].first->GetIdx()); + } + + // Add Data Edges + for (const auto &data_edge : stage_info.inner_data_edges) { + graph_builder.AddDataLink(data_edge.first->GetOwnerNode()->GetName(), data_edge.first->GetIdx(), + data_edge.second->GetOwnerNode()->GetName(), data_edge.second->GetIdx()); + } + + // Add Ctrl Edges + for (const auto &ctrl_edge : stage_info.inner_ctrl_edges) { + graph_builder.AddControlLink(ctrl_edge.first->GetOwnerNode()->GetName(), + ctrl_edge.second->GetOwnerNode()->GetName()); + } + + // Add Input-Mapping + std::map input_mapping; + for (size_t i = 0; i < data_input_num; i++) { + input_mapping[i] = i; + } + graph_builder.SetInputMapping(input_mapping); + + // Add outputMapping + std::map output_mapping; + for (size_t i = 0; i < data_output_num; i++) { + output_mapping[i] = i; + } + graph_builder.SetOutputMapping(output_mapping); + + graphStatus error_code = GRAPH_SUCCESS; + std::string error_msg; + ComputeGraphPtr subgraph = graph_builder.Build(error_code, error_msg); + if (subgraph == nullptr) { + GELOGE(error_code, "Build subgraph %s failed: %s.", subgraph_node->GetName().c_str(), error_msg.c_str()); + return nullptr; + } + if (!AttrUtils::SetInt(subgraph, ATTR_STAGE_LEVEL, stage_info.stage_level)) { + GELOGE(FAILED, "Set ATTR_STAGE_LEVEL on graph %s failed.", subgraph->GetName().c_str()); + return nullptr; + } + + return subgraph; +} + +Status StagePartitioner::RelinkDataEdges(const NodePtr &subgraph_node, const StageInfo &stage_info) { + // in data nodes + for (size_t i = 0; i < stage_info.data_inputs.size(); i++) { + if 
(stage_info.data_inputs[i].first->Unlink(stage_info.data_inputs[i].second) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Remove data edge %s:%d->%s:%d failed.", + stage_info.data_inputs[i].first->GetOwnerNode()->GetName().c_str(), + stage_info.data_inputs[i].first->GetIdx(), + stage_info.data_inputs[i].second->GetOwnerNode()->GetName().c_str(), + stage_info.data_inputs[i].second->GetIdx()); + return INTERNAL_ERROR; + } + if (stage_info.data_inputs[i].first->LinkTo(subgraph_node->GetInDataAnchor(i)) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Add data edge %s:%d->%s:%zu failed.", + stage_info.data_inputs[i].first->GetOwnerNode()->GetName().c_str(), + stage_info.data_inputs[i].first->GetIdx(), subgraph_node->GetName().c_str(), i); + return INTERNAL_ERROR; + } + } + // out data nodes + for (size_t i = 0; i < stage_info.data_outputs.size(); i++) { + const auto &out_data_anchor = subgraph_node->GetOutDataAnchor(i); + GE_CHECK_NOTNULL(out_data_anchor); + for (const auto &peer_in_anchor : stage_info.data_outputs[i].second) { + if (stage_info.data_outputs[i].first->Unlink(peer_in_anchor) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Remove data edge %s:%d->%s:%d failed.", + stage_info.data_outputs[i].first->GetOwnerNode()->GetName().c_str(), + stage_info.data_outputs[i].first->GetIdx(), peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetIdx()); + return INTERNAL_ERROR; + } + if (out_data_anchor->LinkTo(peer_in_anchor) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Add data edge %s:%zu->%s:%d failed.", subgraph_node->GetName().c_str(), i, + peer_in_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetIdx()); + return INTERNAL_ERROR; + } + } + } + + return SUCCESS; +} + +Status StagePartitioner::RelinkCtrlEdges(const NodePtr &subgraph_node, const StageInfo &stage_info) { + // in ctrl nodes + for (const auto &ctrl_input : stage_info.ctrl_inputs) { + if (ctrl_input.first->Unlink(ctrl_input.second) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, 
"Remove ctrl edge %s->%s failed.", ctrl_input.first->GetOwnerNode()->GetName().c_str(), + ctrl_input.second->GetOwnerNode()->GetName().c_str()); + return INTERNAL_ERROR; + } + if (!ctrl_input.first->IsLinkedWith(subgraph_node->GetInControlAnchor())) { + if (ctrl_input.first->LinkTo(subgraph_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Add ctrl edge %s->%s failed.", ctrl_input.first->GetOwnerNode()->GetName().c_str(), + subgraph_node->GetName().c_str()); + return INTERNAL_ERROR; + } + } + } + // out ctrl nodes + for (const auto &ctrl_output : stage_info.ctrl_outputs) { + if (ctrl_output.first->Unlink(ctrl_output.second) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Remove ctrl edge %s->%s failed.", ctrl_output.first->GetOwnerNode()->GetName().c_str(), + ctrl_output.second->GetOwnerNode()->GetName().c_str()); + return INTERNAL_ERROR; + } + if (!subgraph_node->GetOutControlAnchor()->IsLinkedWith(ctrl_output.second)) { + if (subgraph_node->GetOutControlAnchor()->LinkTo(ctrl_output.second) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Add ctrl edge %s->%s failed.", subgraph_node->GetName().c_str(), + ctrl_output.second->GetOwnerNode()->GetName().c_str()); + return INTERNAL_ERROR; + } + } + } + + return SUCCESS; +} +} // namespace ge diff --git a/src/ge/graph/partition/stage_partition.h b/src/ge/graph/partition/stage_partition.h new file mode 100644 index 00000000..d8364f0d --- /dev/null +++ b/src/ge/graph/partition/stage_partition.h @@ -0,0 +1,67 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_GRAPH_PARTITION_STAGE_PARTITION_H_ +#define GE_GRAPH_PARTITION_STAGE_PARTITION_H_ + +#include +#include +#include +#include +#include "common/ge_inner_error_codes.h" +#include "graph/compute_graph.h" + +namespace ge { +struct StageInfo { + explicit StageInfo(uint32_t level) : stage_level(level) {} + uint32_t stage_level; + std::unordered_set stage_nodes; + std::vector> data_inputs; + std::vector>> data_outputs; + std::list> ctrl_inputs; + std::list> ctrl_outputs; + std::list> inner_data_edges; + std::list> inner_ctrl_edges; +}; + +class StagePartitioner { + public: + explicit StagePartitioner(ComputeGraphPtr graph) : root_graph_(std::move(graph)) {} + ~StagePartitioner() = default; + + Status Partition(); + + private: + Status SplitStageLevel(); + + Status StagePartition(); + + static void FindStageIO(const std::unordered_set &stage_nodes, StageInfo &stage_info); + + NodePtr BuildSubgraphNode(const std::string &graph_name, const StageInfo &stage_info); + + static ComputeGraphPtr BuildStageGraph(const NodePtr &subgraph_node, const StageInfo &stage_info); + + static Status RelinkDataEdges(const NodePtr &subgraph_node, const StageInfo &stage_info); + + static Status RelinkCtrlEdges(const NodePtr &subgraph_node, const StageInfo &stage_info); + + ComputeGraphPtr root_graph_; + std::map> stage_nodes_; +}; +} // namespace ge + +#endif // GE_GRAPH_PARTITION_STAGE_PARTITION_H_ diff --git a/src/ge/graph/passes/aicpu_constant_folding_pass.cc b/src/ge/graph/passes/aicpu_constant_folding_pass.cc index 4157b5d6..be7f79c4 100644 --- 
a/src/ge/graph/passes/aicpu_constant_folding_pass.cc +++ b/src/ge/graph/passes/aicpu_constant_folding_pass.cc @@ -29,9 +29,10 @@ #include "graph/utils/op_desc_utils.h" #include "graph/utils/type_utils.h" #include "init/gelib.h" +#include "opskernel_manager/ops_kernel_builder_manager.h" namespace { -const char *const kKernelLibName = "aicpu_kernel"; +const char *const kKernelLibName = "aicpu_tf_kernel"; const char *const kNotSupported = "0"; const uint64_t kReleaseFlag = 1; const uint64_t kOpsFlag = 1; @@ -314,8 +315,8 @@ Status AicpuConstantFoldingPass::LaunchSingleOpRunTask(const NodePtr &node, cons GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized"); return GE_CLI_GE_NOT_INITIALIZED; } - OpsKernelInfoStorePtr kernel_info = instance_ptr->OpsKernelManagerObj().GetOpsKernelInfoStore(kKernelLibName); - if (kernel_info == nullptr) { + auto kernel_builder = OpsKernelBuilderManager::Instance().GetOpsKernelBuilder(kKernelLibName); + if (kernel_builder == nullptr) { GELOGE(FAILED, "Get op kernel info store failed"); return FAILED; } @@ -325,7 +326,7 @@ Status AicpuConstantFoldingPass::LaunchSingleOpRunTask(const NodePtr &node, cons aicpu_task.fwkKernelBase.fwk_kernel.extInfoAddr = 0; aicpu_task.fwkKernelBase.fwk_kernel.extInfoLen = 0; std::string task_info; - Status ret = kernel_info->GenSingleOpRunTask(node, aicpu_task, task_info); + Status ret = kernel_builder->GenSingleOpRunTask(node, aicpu_task, task_info); if (ret != SUCCESS) { return ret; } @@ -369,8 +370,8 @@ Status AicpuConstantFoldingPass::LaunchMemCopyTask(const vector &data_ GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized"); return GE_CLI_GE_NOT_INITIALIZED; } - OpsKernelInfoStorePtr kernel_info = instance_ptr->OpsKernelManagerObj().GetOpsKernelInfoStore(kKernelLibName); - if (kernel_info == nullptr) { + auto kernel_builder = OpsKernelBuilderManager::Instance().GetOpsKernelBuilder(kKernelLibName); + if (kernel_builder == nullptr) { GELOGE(FAILED, "Get op kernel info store failed"); return 
FAILED; } @@ -380,7 +381,7 @@ Status AicpuConstantFoldingPass::LaunchMemCopyTask(const vector &data_ aicpu_task.fwkKernelBase.fwk_kernel.extInfoAddr = 0; aicpu_task.fwkKernelBase.fwk_kernel.extInfoLen = 0; std::string task_info; - Status ret = kernel_info->GenMemCopyTask(data_infos.size(), aicpu_task, task_info); + Status ret = kernel_builder->GenMemCopyTask(data_infos.size(), aicpu_task, task_info); if (ret != SUCCESS) { return ret; } diff --git a/src/ge/graph/passes/atomic_addr_clean_pass.cc b/src/ge/graph/passes/atomic_addr_clean_pass.cc index 2c7fb9bb..1baa9baa 100644 --- a/src/ge/graph/passes/atomic_addr_clean_pass.cc +++ b/src/ge/graph/passes/atomic_addr_clean_pass.cc @@ -50,8 +50,8 @@ Status AtomicAddrCleanPass::Run(ComputeGraphPtr graph) { return SUCCESS; } - bool is_known_graph = graph->GetGraphUnknownFlag(); - if (is_known_graph) { + bool is_unknown_graph = graph->GetGraphUnknownFlag(); + if (is_unknown_graph) { GELOGD("Graph[%s] is unknown graph. It will call fe interface to compile op.", graph->GetName().c_str()); GE_CHK_STATUS_RET(CompileUnknownGraphOp(atomic_node_vec)); return SUCCESS; diff --git a/src/ge/graph/passes/compile_nodes_pass.cc b/src/ge/graph/passes/compile_nodes_pass.cc index a93671c7..037cc332 100644 --- a/src/ge/graph/passes/compile_nodes_pass.cc +++ b/src/ge/graph/passes/compile_nodes_pass.cc @@ -30,7 +30,7 @@ using domi::ImplyType; namespace { const char *const kAICPUEngineName = "DNN_VM_AICPU"; -const char *const kAICPUKernelLibName = "aicpu_kernel"; +const char *const kAICPUKernelLibName = "aicpu_tf_kernel"; } // namespace namespace ge { diff --git a/src/ge/graph/passes/constant_folding_pass.cc b/src/ge/graph/passes/constant_folding_pass.cc index 80bf7867..95eba490 100644 --- a/src/ge/graph/passes/constant_folding_pass.cc +++ b/src/ge/graph/passes/constant_folding_pass.cc @@ -17,19 +17,36 @@ #include "graph/passes/constant_folding_pass.h" #include - -#include "common/debug/log.h" -#include "common/types.h" -#include 
"framework/common/debug/ge_log.h" #include "graph/operator_factory.h" -#include "graph/utils/attr_utils.h" #include "graph/utils/node_utils.h" -#include "graph/utils/op_desc_utils.h" #include "graph/utils/type_utils.h" -#include "inc/kernel.h" +#include "init/gelib.h" namespace ge { const int64_t kStartCallNum = 1; +const std::string kKernelLibName = "aicpu_tf_kernel"; +// tf_kernel.json opsFlag config +const std::string kOpsFlagClose = "0"; + +Status RunOpKernelWithCheck(NodePtr &node, const vector &inputs, std::vector &outputs) { + std::shared_ptr instance_ptr = ge::GELib::GetInstance(); + if ((instance_ptr == nullptr) || (!instance_ptr->InitFlag())) { + GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized or is finalized."); + return UNSUPPORTED; + } + OpsKernelInfoStorePtr kernel_info = instance_ptr->OpsKernelManagerObj().GetOpsKernelInfoStore(kKernelLibName); + if (kernel_info == nullptr) { + GELOGE(FAILED, "Get op kernel info store %s failed", kKernelLibName.c_str()); + return UNSUPPORTED; + } + + std::string ops_flag; + kernel_info->opsFlagCheck(*node, ops_flag); + if (ops_flag == kOpsFlagClose) { + return UNSUPPORTED; + } + return FoldingPass::RunOpKernel(node, inputs, outputs); +} const std::unordered_map> &ConstantFoldingPass::GetGeConstantFoldingPerfStatistic() const { @@ -63,8 +80,8 @@ Status ConstantFoldingPass::Run(ge::NodePtr &node) { auto inputs = OpDescUtils::GetInputData(input_nodes); vector outputs; // Statistic of ge constant folding kernel - uint64_t start_time = GetCurrentTimestap(); - auto ret = RunOpKernel(node, inputs, outputs); + uint64_t start_time = GetCurrentTimestamp(); + auto ret = RunOpKernelWithCheck(node, inputs, outputs); if (ret != SUCCESS) { auto op_kernel = folding_pass::GetKernelByType(node); if (op_kernel == nullptr) { @@ -74,9 +91,9 @@ Status ConstantFoldingPass::Run(ge::NodePtr &node) { } // Statistic of op and fe constant folding kernel - start_time = GetCurrentTimestap(); + start_time = GetCurrentTimestamp(); ret = 
op_kernel->Compute(node_desc, inputs, outputs); - uint64_t cost_time = GetCurrentTimestap() - start_time; + uint64_t cost_time = GetCurrentTimestamp() - start_time; if (statistic_of_ge_constant_folding_.find(node->GetType()) != statistic_of_ge_constant_folding_.end()) { uint64_t &cnt = statistic_of_ge_constant_folding_[node->GetType()].first; uint64_t &cur_cost_time = statistic_of_ge_constant_folding_[node->GetType()].second; @@ -100,10 +117,10 @@ Status ConstantFoldingPass::Run(ge::NodePtr &node) { uint64_t &cnt = statistic_of_op_constant_folding_[node->GetType()].first; uint64_t &cost_time = statistic_of_op_constant_folding_[node->GetType()].second; cnt++; - cost_time += GetCurrentTimestap() - start_time; + cost_time += GetCurrentTimestamp() - start_time; } else { statistic_of_op_constant_folding_[node->GetType()] = - std::pair(kStartCallNum, GetCurrentTimestap() - start_time); + std::pair(kStartCallNum, GetCurrentTimestamp() - start_time); } } diff --git a/src/ge/graph/passes/ctrl_edge_transfer_pass.cc b/src/ge/graph/passes/ctrl_edge_transfer_pass.cc index 9454c00d..6c426e95 100644 --- a/src/ge/graph/passes/ctrl_edge_transfer_pass.cc +++ b/src/ge/graph/passes/ctrl_edge_transfer_pass.cc @@ -20,6 +20,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "framework/common/util.h" #include "graph/utils/graph_utils.h" +#include "graph/debug/ge_attr_define.h" namespace ge { /* Pass Explaination: @@ -42,6 +43,12 @@ Status CtrlEdgeTransferPass::Run(ge::ComputeGraphPtr graph) { GELOGD("CtrlEdgeTransferPass start running"); GE_CHECK_NOTNULL(graph); + bool is_dynamic_shape = false; + (void)AttrUtils::GetBool(graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, is_dynamic_shape); + if (!is_dynamic_shape) { + return SUCCESS; + } + for (ge::NodePtr &n : graph->GetDirectNode()) { auto op_desc = n->GetOpDesc(); if (op_desc == nullptr) { diff --git a/src/ge/graph/passes/infershape_pass.cc b/src/ge/graph/passes/infershape_pass.cc index cacca584..760168a7 100644 --- 
a/src/ge/graph/passes/infershape_pass.cc +++ b/src/ge/graph/passes/infershape_pass.cc @@ -21,6 +21,7 @@ #include "analyzer/analyzer.h" #include "framework/common/util.h" #include "graph/shape_refiner.h" +#include "graph/utils/graph_utils.h" namespace ge { Status InferShapePass::Run(NodePtr &node) { @@ -29,9 +30,12 @@ Status InferShapePass::Run(NodePtr &node) { // select INFERSHAPE failed info auto graph = node->GetOwnerComputeGraph(); GE_CHECK_NOTNULL(graph); - analyzer::DataInfo analyze_info{graph->GetSessionID(), graph->GetGraphID(), analyzer::INFER_SHAPE, node, + auto root_graph = ge::GraphUtils::FindRootGraph(graph); + GE_CHECK_NOTNULL(root_graph); + analyzer::DataInfo analyze_info{root_graph->GetSessionID(), root_graph->GetGraphID(), analyzer::INFER_SHAPE, node, "InferShapeFailed!"}; (void)Analyzer::GetInstance()->DoAnalyze(analyze_info); + (void)Analyzer::GetInstance()->SaveAnalyzerDataToFile(root_graph->GetSessionID(), root_graph->GetGraphID()); GELOGE(GE_GRAPH_INFERSHAPE_FAILED, "infershape failed. 
node: %s", node->GetName().c_str()); return GE_GRAPH_INFERSHAPE_FAILED; diff --git a/src/ge/graph/passes/mark_graph_unknown_status_pass.cc b/src/ge/graph/passes/mark_graph_unknown_status_pass.cc index 7106e58c..2abec90b 100644 --- a/src/ge/graph/passes/mark_graph_unknown_status_pass.cc +++ b/src/ge/graph/passes/mark_graph_unknown_status_pass.cc @@ -16,17 +16,24 @@ #include "graph/passes/mark_graph_unknown_status_pass.h" #include "graph/utils/node_utils.h" +#include "graph/debug/ge_attr_define.h" namespace ge { Status MarkGraphUnknownStatusPass::Run(ComputeGraphPtr graph) { GE_CHECK_NOTNULL(graph); bool is_unknown_shape = false; + bool forced_unknown = false; for (const auto &node : graph->GetDirectNode()) { GE_CHK_STATUS_RET(ge::NodeUtils::GetNodeUnknownShapeStatus(*node, is_unknown_shape), "Get node[%s] shape status failed!", node->GetName().c_str()); if (is_unknown_shape) { break; } + if (AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_FORCE_UNKNOWN_SHAPE, forced_unknown) && forced_unknown) { + GELOGD("node %s was marked as unknown shape.", node->GetName().c_str()); + is_unknown_shape = true; + break; + } } graph->SetGraphUnknownFlag(is_unknown_shape); GELOGD("mark graph [%s] unknown status success! 
value is %d", graph->GetName().c_str(), is_unknown_shape); diff --git a/src/ge/graph/passes/multi_batch_clone_pass.cc b/src/ge/graph/passes/multi_batch_clone_pass.cc index 80355ca7..c6d446af 100644 --- a/src/ge/graph/passes/multi_batch_clone_pass.cc +++ b/src/ge/graph/passes/multi_batch_clone_pass.cc @@ -18,6 +18,7 @@ #include "common/formats/utils/formats_trans_utils.h" #include "common/ge/ge_util.h" +#include "graph/common/local_context.h" #include "graph/preprocess/multi_batch_options.h" #include "graph/utils/node_utils.h" #include "graph/utils/op_desc_utils.h" @@ -33,6 +34,7 @@ const std::string kMultiBatchCaseNode = "ascend_mbatch_shape_case"; const std::string kMultiBatchDataNode = "ascend_mbatch_shape_data"; const std::string kMultiBatchConstNode = "ascend_mbatch_shape_const"; const std::string kMultiBatchMapIndexNode = "ascend_mbatch_shape_mapindex"; +const std::string kMultiBatchNodePostfix = "_ascend_mbatch_batch_"; } // namespace Status MultiBatchClonePass::Run(ComputeGraphPtr graph) { @@ -53,6 +55,13 @@ Status MultiBatchClonePass::Run(ComputeGraphPtr graph) { return INTERNAL_ERROR; } + // parser data dynamic info from atc parameter --input_shape + if (multibatch::ParserDataToDynmaicInfo(batch_shapes_, GetLocalOmgContext().user_input_dims, data_to_dynamic_info_) != + SUCCESS) { + GELOGE(PARAM_INVALID, "Parse each data's own dynamic info failed"); + return PARAM_INVALID; + } + (void)AttrUtils::GetStr(graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id_); ComputeGraphPtr branch = MakeShared(graph->GetName()); if (branch == nullptr) { @@ -165,6 +174,14 @@ Status MultiBatchClonePass::CreateRootGraph(const ComputeGraphPtr &graph) { } } + std::vector data_name_order; + for (auto &item : GetLocalOmgContext().user_input_dims) { + data_name_order.push_back(item.first); + } + if (!AttrUtils::SetListStr(op_desc, ATTR_USER_DESIGNEATE_SHAPE_ORDER, data_name_order)) { + GELOGE(FAILED, "Failed to add user designate shape order attr on case node %s", 
op_desc->GetName().c_str()); + return FAILED; + } GE_CHK_STATUS_RET(multibatch::StampDynamicType(op_desc), "Set dynamic type failed"); GE_CHK_STATUS_RET(CreateIndexNode(graph), "Create index node failed"); @@ -391,6 +408,7 @@ Status MultiBatchClonePass::CreateConstNode(const ComputeGraphPtr &graph) { // Const no InputDesc, Data need InputDesc. (void)op_desc->AddInputDesc(op_desc->GetOutputDesc(kDataOutIndex)); (void)AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, data_index); + (void)NodeUtils::AppendInputAnchor(all_const_nodes_[i], 1); } all_const_nodes_.swap(all_const_nodes); @@ -454,6 +472,7 @@ Status MultiBatchClonePass::CreateOutputNode(const ComputeGraphPtr &graph) { /// Status MultiBatchClonePass::SetMaxShapeToData(const NodePtr &data) { auto data_shape = NodeUtils::GetOutputDesc(*data, kDataOutIndex).GetShape(); + auto data_name = data->GetName(); const auto &dims = data_shape.GetDims(); if (std::all_of(dims.begin(), dims.end(), [](int64_t val) { return val >= 0; })) { return SUCCESS; @@ -464,9 +483,10 @@ Status MultiBatchClonePass::SetMaxShapeToData(const NodePtr &data) { int64_t max_size = 0; for (size_t i = 0; i < batch_shapes_.size(); ++i) { int64_t size = 1; - for (auto dim : batch_shapes_[i]) { + for (auto dim : data_to_dynamic_info_.at(data_name).at(i)) { if (INT64_MAX / dim < size) { - GELOGE(PARAM_INVALID, "The shape %s size overflow", formats::ShapeToString(batch_shapes_[i]).c_str()); + GELOGE(PARAM_INVALID, "The shape %s size overflow", + formats::ShapeToString(data_to_dynamic_info_.at(data_name).at(i)).c_str()); return PARAM_INVALID; } size *= dim; @@ -477,17 +497,17 @@ Status MultiBatchClonePass::SetMaxShapeToData(const NodePtr &data) { } } - return SetShapeToData(batch_shapes_[max_shape_index], data, data_shape); + return SetShapeToData(data_to_dynamic_info_.at(data_name).at(max_shape_index), data, data_shape); } /// /// @ingroup ge /// @brief Set shape to Data node in branch. /// @param [in] const NodePtr &data: data in branch. 
-/// @param [in] const std::vector &shapes: dims of shape. +/// @param [in] size_t index: The batch index. /// @return 0: SUCCESS / others: FAILED /// -Status MultiBatchClonePass::UpdataShapeToData(const NodePtr &data, const vector &shapes) { +Status MultiBatchClonePass::UpdateShapeToData(const NodePtr &data, size_t index) { auto data_shape = NodeUtils::GetOutputDesc(*data, kDataOutIndex).GetShape(); const auto &dims = data_shape.GetDims(); if (std::all_of(dims.begin(), dims.end(), [](int64_t val) { return val >= 0; })) { @@ -495,7 +515,16 @@ Status MultiBatchClonePass::UpdataShapeToData(const NodePtr &data, const vector< } (void)AttrUtils::SetListInt(data->GetOpDesc(), ATTR_MBATCH_ORIGIN_INPUT_DIMS, data_shape.GetDims()); - return SetShapeToData(shapes, data, data_shape); + auto data_name = data->GetName(); + size_t pos = data_name.find(kMultiBatchNodePostfix); + if (pos == string::npos) { + GELOGE(FAILED, "Cannot find key string [%s] of multi-batch in name of virtual input node, node name: %s.", + kMultiBatchNodePostfix.c_str(), data_name.c_str()); + return FAILED; + } + + auto parent_name = data_name.substr(0, pos); + return SetShapeToData(data_to_dynamic_info_.at(parent_name).at(index), data, data_shape); } /// @@ -534,42 +563,38 @@ Status MultiBatchClonePass::SetShapeToData(const vector &shapes, const /// @return 0: SUCCESS / others: FAILED /// Status MultiBatchClonePass::CreateSubgraphs(const ComputeGraphPtr &graph, const ComputeGraphPtr &branch) { - const std::string name = graph->GetName() + "_branche_"; const auto &op_desc = case_node_->GetOpDesc(); for (size_t i = 0; i < batch_shapes_.size(); ++i) { std::vector input_nodes; std::vector output_nodes; - const std::string prefix = "branche_" + std::to_string(i) + "_"; - ComputeGraphPtr subgraph = (i == 0) ? branch : GraphUtils::CloneGraph(branch, prefix, input_nodes, output_nodes); + const std::string postfix = kMultiBatchNodePostfix + std::to_string(i); + ComputeGraphPtr subgraph = (i == 0) ? 
branch : GraphUtils::CloneGraph(branch, postfix, input_nodes, output_nodes); if (subgraph == nullptr) { GELOGE(FAILED, "Create multi-batch case node failed"); return FAILED; } - subgraph->SetName(name + std::to_string(i)); + subgraph->SetName("Batch_" + std::to_string(i)); subgraph->SetParentNode(case_node_); subgraph->SetParentGraph(graph); - (void)AttrUtils::SetStr(subgraph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id_); - all_branch_output_[subgraph] = subgraph->FindFirstNodeMatchType(NETOUTPUT); - graph->AddSubgraph(subgraph->GetName(), subgraph); + all_branch_output_[subgraph] = subgraph->FindFirstNodeMatchType(NETOUTPUT); - const std::string key_name = "branches" + std::to_string(i); + const string key_name = "branches" + std::to_string(i); op_desc->AddSubgraphName(key_name); op_desc->SetSubgraphInstanceName(i, subgraph->GetName()); for (const auto &data : input_nodes) { - GE_CHK_STATUS_RET(UpdataShapeToData(data, batch_shapes_[i]), "Update %s failed", subgraph->GetName().c_str()); + GE_CHK_STATUS_RET(UpdateShapeToData(data, i), "Update %s failed", subgraph->GetName().c_str()); } } // Origninal graph take as first subgraph, update node name. for (const auto &n : branch->GetDirectNode()) { const auto &op_desc = n->GetOpDesc(); - op_desc->SetName("branche_0_" + n->GetName()); - + op_desc->SetName(n->GetName() + kMultiBatchNodePostfix + "0"); if (n->GetType() == DATA) { - GE_CHK_STATUS_RET(UpdataShapeToData(n, batch_shapes_[0]), "Update %s failed", branch->GetName().c_str()); + GE_CHK_STATUS_RET(UpdateShapeToData(n, 0), "Update %s failed", branch->GetName().c_str()); } } diff --git a/src/ge/graph/passes/multi_batch_clone_pass.h b/src/ge/graph/passes/multi_batch_clone_pass.h index 0d52b738..454aff41 100644 --- a/src/ge/graph/passes/multi_batch_clone_pass.h +++ b/src/ge/graph/passes/multi_batch_clone_pass.h @@ -107,10 +107,10 @@ class MultiBatchClonePass : public GraphPass { /// @ingroup ge /// @brief Set shape to Data node in branch. 
/// @param [in] const NodePtr &data: data in branch. - /// @param [in] const std::vector &shapes: dims of shape. + /// @param [in] size_t index: The batch index. /// @return 0: SUCCESS / others: FAILED /// - Status UpdataShapeToData(const NodePtr &data, const std::vector &shapes); + Status UpdateShapeToData(const NodePtr &data, size_t index); /// /// @ingroup ge @@ -165,6 +165,7 @@ class MultiBatchClonePass : public GraphPass { std::map direct_output_; std::map all_branch_output_; + std::map>> data_to_dynamic_info_; NodePtr case_node_; }; diff --git a/src/ge/graph/passes/multi_batch_pass.cc b/src/ge/graph/passes/multi_batch_pass.cc index 32152a6f..3638f8a0 100644 --- a/src/ge/graph/passes/multi_batch_pass.cc +++ b/src/ge/graph/passes/multi_batch_pass.cc @@ -95,6 +95,34 @@ Status MultiBatchPass::ClearStatus() { return SUCCESS; } +/// +/// @ingroup ge +/// @brief Set batch label for Case mode. +/// @param [in] const ComputeGraphPtr &graph: Root/Case graph. +/// @param [in] const NodePtr &case_node: Case Node. 
+/// @return 0: SUCCESS / others: FAILED +/// +Status MultiBatchPass::SetCaseLabel(const ComputeGraphPtr &graph, const NodePtr &case_node) { + const auto &func_desc = case_node->GetOpDesc(); + if (!func_desc->HasAttr(ATTR_NAME_BATCH_NUM)) { + GELOGD("Graph: %s Not multi-batch, Node: %s", graph->GetName().c_str(), case_node->GetName().c_str()); + return SUCCESS; + } + + const auto &dynamic_branch_names = func_desc->GetSubgraphInstanceNames(); + for (size_t i = 0; i < dynamic_branch_names.size(); ++i) { + const auto &subgraph = graph->GetSubgraph(dynamic_branch_names[i]); + GE_CHECK_NOTNULL(subgraph); + + const string batch_label = "Batch_" + std::to_string(i); + for (const auto &node : subgraph->GetDirectNode()) { + (void)AttrUtils::SetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label); + } + } + + return SUCCESS; +} + /// /// @brief Replace & Combine SwitchN nodes /// @param [in] graph @@ -103,6 +131,10 @@ Status MultiBatchPass::ClearStatus() { /// Status MultiBatchPass::FindPredValue(const ComputeGraphPtr &graph, OutDataAnchorPtr &pred_value) { for (const NodePtr &node : graph->GetDirectNode()) { + if (node->GetType() == CASE) { + GE_CHK_STATUS_RET(SetCaseLabel(graph, node), "Set batch label failed"); + continue; + } if (node->GetType() != SWITCHN) { continue; } diff --git a/src/ge/graph/passes/multi_batch_pass.h b/src/ge/graph/passes/multi_batch_pass.h index 1806229f..bc93bc09 100644 --- a/src/ge/graph/passes/multi_batch_pass.h +++ b/src/ge/graph/passes/multi_batch_pass.h @@ -53,6 +53,15 @@ class MultiBatchPass : public GraphPass { Status AttachLabelOnly(uint32_t batch_num); Status GetUserDesignateShape(); + /// + /// @ingroup ge + /// @brief Set batch label for Case mode. + /// @param [in] const ComputeGraphPtr &graph: Root/Case graph. + /// @param [in] const NodePtr &case_node: Case Node. 
+ /// @return 0: SUCCESS / others: FAILED + /// + Status SetCaseLabel(const ComputeGraphPtr &graph, const NodePtr &case_node); + std::vector switch_n_nodes_; std::vector bypass_nodes_; std::vector> batch_head_nodes_; diff --git a/src/ge/graph/passes/subexpression_migration_pass.cc b/src/ge/graph/passes/subexpression_migration_pass.cc index c7f3845e..52e6b0c4 100644 --- a/src/ge/graph/passes/subexpression_migration_pass.cc +++ b/src/ge/graph/passes/subexpression_migration_pass.cc @@ -162,6 +162,7 @@ Status SubexpressionMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &grap } data_nodes[parent_index] = data; + GELOGD("%s, Parent index: %u, Data: %s", subgraph->GetName().c_str(), parent_index, data->GetName().c_str()); } } @@ -302,7 +303,7 @@ Status SubexpressionMigrationPass::GraphNodeMigration(const ComputeGraphPtr &gra continue; } - GELOGI("Move to parent: %s", base_node->GetName().c_str()); + GELOGI("Move to parent: %s, parent index: %u", base_node->GetName().c_str(), base_idx); if (AppendParallelNode(graph_nodes, func_node, outputs) != SUCCESS) { return FAILED; } @@ -335,12 +336,12 @@ Status SubexpressionMigrationPass::AppendParallelNode(map append_num; for (auto &groups : graph_nodes) { const auto &subgraph = groups.first; auto &data_nodes = groups.second; - uint32_t data_index = data_nodes.size(); - item.second = data_index + kCaseInputBase; // Update to valid parent index. + item.second = func_node->GetAllInDataAnchorsSize() + append_num[subgraph]; // Update to valid parent index. std::string data_name = subgraph->GetName() + "_data_" + std::to_string(item.second); OpDescBuilder op_builder(data_name, DATA); @@ -350,6 +351,7 @@ Status SubexpressionMigrationPass::AppendParallelNode(mapGetName().c_str()); return FAILED; @@ -360,11 +362,13 @@ Status SubexpressionMigrationPass::AppendParallelNode(mapAddNode(op_desc); + GELOGI("Add Node: %s, parent index: %u", op_desc->GetName().c_str(), item.second); } // Add InputTensor to functional Node. 
- NodeUtils::AppendInputAnchor(func_node, item.second + 1); + GE_CHK_GRAPH_STATUS_RET(NodeUtils::AppendInputAnchor(func_node, item.second + 1), "Append input failed"); migration_append_ = true; } diff --git a/src/ge/graph/passes/subgraph_const_migration_pass.cc b/src/ge/graph/passes/subgraph_const_migration_pass.cc new file mode 100644 index 00000000..c62a712e --- /dev/null +++ b/src/ge/graph/passes/subgraph_const_migration_pass.cc @@ -0,0 +1,570 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "subgraph_const_migration_pass.h" + +#include "graph/utils/node_utils.h" +#include "ge_local_engine/engine/host_cpu_engine.h" +#include "graph/passes/folding_pass.h" + +namespace ge { +constexpr uint32_t kDataOutIndex = 0; +constexpr uint32_t kCaseInputBase = 1; +constexpr uint32_t kInvalidParent = 0x7fffffffU; + +bool IsSameOpNode(const NodePtr &src_node, const NodePtr &dst_node) { + if ((src_node == nullptr) && (dst_node == nullptr)) { + return true; + } + + if ((src_node == nullptr) || (dst_node == nullptr)) { + return false; + } + + if (src_node->GetType() != dst_node->GetType()) { + return false; + } + + if ((src_node->GetInControlNodes().size() != dst_node->GetInControlNodes().size()) || + (src_node->GetOutDataNodesSize() != dst_node->GetOutDataNodesSize())) { + return false; + } + + set related_parent; + const auto in_nodes = src_node->GetInControlNodes(); + for (uint32_t i = 0; i < in_nodes.size(); ++i) { + const auto owner_node = in_nodes.at(i); + uint32_t parent_index = 0; + if (!AttrUtils::GetInt(owner_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + return false; + } + + related_parent.insert(parent_index); + } + + for (const auto &in_node : dst_node->GetInControlNodes()) { + uint32_t parent_index = 0; + if (!AttrUtils::GetInt(in_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + return false; + } + + if (related_parent.count(parent_index) == 0) { + return false; + } + } + + return true; +} + +/*********************************************************************************************************************** + +-----------+ + | Data | + +-----------+ + | + | + +-----------+ + | Cast | + +-----------+ + | + | + +-----------+ +-----------+ +-----------+ + | TransData | | Data | | Data | + +-----------+ +-----------+ +-----------+ + \ | / + \ | / + \ | / + \ | / + +-----------+ +-----------+ +-----------+ +-----------+ +-----------+ +-----------+ +-----------+ + | Data | | Data | | Data | | Data | | 
Data | | Data | | Conv2D | + +-----------+ +-----------+ +-----------+ +-----------+ +-----------+ +-----------+ +-----------+ + \ \ | / / | | + \ \ | / / | | + \ \ | / / | | + \ \ | / / | | + \ +-----------+ / | +-----------+ + +---------------| Const |----------------+ | | Pooling | + +-----------+ | +-----------+ + \ | / + \ | / + \ +-----------+ / + +-----------------------------------| Conv2D |------+ + +-----------+ + | + | + +-----------+ + | Node | + +-----------+ +***********************************************************************************************************************/ +Status SubgraphConstMigrationPass::Run(ComputeGraphPtr graph) { + GE_CHECK_NOTNULL(graph); + if (graph->GetParentGraph() != nullptr) { + GELOGD("Subgraph %s skip the SubgraphConstMigrationPass", graph->GetName().c_str()); + return SUCCESS; + } + + GELOGD("Begin to run Subgraph Const Migration on graph: %s", graph->GetName().c_str()); + for (const auto &node : graph->GetDirectNode()) { + if (node->GetType() != CASE) { + continue; + } + + const auto &func_desc = node->GetOpDesc(); + if (!func_desc->HasAttr(ATTR_NAME_BATCH_NUM)) { + GELOGD("Not multi-batch, Skip Case: %s", node->GetName().c_str()); + continue; + } + + do { + migration_append_ = false; + map> graph_datas; + if (ClassifyDataNodes(graph, func_desc, graph_datas) != SUCCESS) { + return FAILED; + } + + if (graph_datas.empty()) { + GELOGW("Graph: %s subgraph is empty", graph->GetName().c_str()); + break; + } + + // {subgraph0, {{1, Data}, {2, Data}, {3, Data}, {4, Data}, ..., {n, Data}}} + // {subgraph1, {{1, Data}, {2, Data}, {3, Data}, {4, Data}, ..., {n, Data}}} + // {subgraph2, {{1, Data}, {2, Data}, {3, Data}, {4, Data}, ..., {n, Data}}} + const auto base_nodes = graph_datas.begin()->second; // Need copy. 
+ for (const auto &node_item : base_nodes) { + if (GraphNodeMigration(graph, node, graph_datas, node_item.second, node_item.first) != SUCCESS) { + return FAILED; + } + } + } while (migration_append_); + } + + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief Get all Data nodes for all subgraph. +/// @param [in] graph: Root compute graph. +/// @param [in] func_desc: functional OpDesc of Case. +/// @param [out] graph_datas: Data groups of subgraph. +/// @return 0: SUCCESS / others: FAILED +/// +Status SubgraphConstMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &graph, const OpDescPtr &func_desc, + map> &graph_datas) { + for (const auto &name : func_desc->GetSubgraphInstanceNames()) { + const auto &subgraph = graph->GetSubgraph(name); + if (subgraph == nullptr) { + GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); + return GE_GRAPH_EMPTY_SUBGRAPH; + } + + auto &data_nodes = graph_datas[subgraph]; + for (auto &data : subgraph->GetDirectNode()) { + if (data->GetType() != DATA) { + continue; + } + + uint32_t parent_index = 0; + if (!AttrUtils::GetInt(data->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + GELOGE(FAILED, "Parent index not found, name: %s", data->GetName().c_str()); + return FAILED; + } + + data_nodes[parent_index] = data; + GELOGD("%s, Parent index: %u, Data: %s", subgraph->GetName().c_str(), parent_index, data->GetName().c_str()); + } + } + + auto iter = graph_datas.begin(); + if (iter == graph_datas.end()) { + return SUCCESS; + } + for (const auto &data_nodes : graph_datas) { + if (data_nodes.second.size() != iter->second.size()) { + GELOGE(FAILED, "Subgraph %s has invalid Data nodes[%zu != %zu]", data_nodes.first->GetName().c_str(), + data_nodes.second.size(), iter->second.size()); + return FAILED; + } + } + + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief Get all Data nodes for all subgraph. +/// @param [in] node: Const node of subgraph. +/// @param [out] inputs: parent index to Const. 
+/// @param [out] outputs: Data groups of subgraph. +/// @return true: SUCCESS / false: FAILED +/// +bool SubgraphConstMigrationPass::GetAssociatedNodes(const NodePtr &node, map &inputs, + map &outputs) { + for (uint32_t i = 0; i < node->GetAllOutDataAnchorsSize(); ++i) { + outputs[i] = kInvalidParent; + } + + uint32_t out_index = 0; + const auto in_nodes = node->GetInAllNodes(); + for (size_t i = 0; i < in_nodes.size(); ++i) { + const auto owner_node = in_nodes.at(i); + if (owner_node->GetType() != DATA) { + return false; + } + + uint32_t parent_index = 0; + if (!AttrUtils::GetInt(owner_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + return false; + } + + // Input Data feed other Node, need add new Data. + inputs[i] = parent_index; + if ((out_index == outputs.size()) && owner_node->GetOutDataNodes().empty()) { + outputs[out_index] = parent_index; + ++out_index; + } + } + + return true; +} + +/// +/// @ingroup ge +/// @brief Get all Data nodes for all subgraph. +/// @param [in] graph_nodes: Data groups of subgraph. +/// @param [in] data_base: Data Node for migration. +/// @param [in] data_idx: Data groups of subgraph. +/// @param [in] data_idx: Data groups of subgraph. 
+/// @return true: Same / false: not same +/// +bool SubgraphConstMigrationPass::IsParallelNodeSame(const map> &graph_datas, + const NodePtr &const_node, uint32_t parent_index, size_t index) { + auto it = graph_datas.begin(); + for (++it; it != graph_datas.end(); ++it) { + const auto &data_nodes = it->second; + auto data_it = data_nodes.find(parent_index); + if (data_it == data_nodes.end()) { + GELOGE(FAILED, "Data: %s not fount, index: %u", const_node->GetName().c_str(), parent_index); + return false; + } + + const auto &work_data = data_it->second; + const auto &out_anchor = work_data->GetOutControlAnchor(); + const auto &in_anchors = out_anchor->GetPeerInControlAnchors(); + if (in_anchors.size() <= index || in_anchors.at(index) == nullptr) { + GELOGW("Node anchors not same, Data: %s -> %s anchor size: %zu, index: %zu", work_data->GetName().c_str(), + const_node->GetName().c_str(), in_anchors.size(), index); + return false; + } + + const auto &in_anchor = in_anchors.at(index); + const auto &work_node = in_anchor->GetOwnerNode(); + if (work_node == nullptr) { + GELOGE(FAILED, "Data: %s not found, parent: %u, index: %zu", const_node->GetName().c_str(), parent_index, index); + return false; + } + + if (!IsSameOpNode(const_node, work_node)) { + GELOGI("OpDesc not same: %s %s, parent: %u, index: %zu", const_node->GetName().c_str(), + work_node->GetName().c_str(), parent_index, index); + return false; + } + } + + return true; +} + +/// +/// @ingroup ge +/// @brief Migration subgraph Node to Root +/// @param [in] graph: Root compute graph. +/// @param [in] func_node: functional Node of Case. +/// @param [in] graph_nodes: Data groups of subgraph. +/// @param [in] data_base: Data Node for migration. +/// @param [in] data_idx: Data groups of subgraph. 
+/// @return 0: SUCCESS / others: FAILED +/// +Status SubgraphConstMigrationPass::GraphNodeMigration(const ComputeGraphPtr &graph, const NodePtr &func_node, + map> &graph_datas, + const NodePtr &data_node, uint32_t parent_index) { + bool can_extrapolation = false; + do { + can_extrapolation = false; + const auto &out_anchor = data_node->GetOutControlAnchor(); + const auto &in_anchors = out_anchor->GetPeerInControlAnchors(); + for (size_t i = in_anchors.size(); i > 0; --i) { + const auto &in_anchor = in_anchors.at(i - 1); + const auto &work_node = in_anchor->GetOwnerNode(); + GELOGD("Data: %s, node: %s, parent: %u, index: %zu", data_node->GetName().c_str(), work_node->GetName().c_str(), + parent_index, i); + if (work_node->GetType() != CONSTANT) { + continue; + } + + // Get associated Data, if Data feed other nodes, need append new Data. + map inputs; + map outputs; + if (!GetAssociatedNodes(work_node, inputs, outputs)) { + continue; + } + + if (!IsParallelNodeSame(graph_datas, work_node, parent_index, i - 1)) { + continue; + } + + GELOGI("Move node: %s, parent: %u, index: %zu", work_node->GetName().c_str(), parent_index, i); + if (AppendParallelNode(graph_datas, func_node, outputs) != SUCCESS) { + return FAILED; + } + + if (MoveNodeToParent(graph, func_node, graph_datas, parent_index, i - 1, inputs, outputs) != SUCCESS) { + return FAILED; + } + can_extrapolation = true; + break; + } + } while (can_extrapolation); + + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief Append Input Tensor for functional node. +/// @param [in] graph_nodes: Data groups of subgraph. +/// @param [in] func_node: functional Node of Case. +/// @param [in] outputs: Parent index of Node output. +/// @return 0: SUCCESS / others: FAILED +/// +Status SubgraphConstMigrationPass::AppendParallelNode(map> &graph_datas, + const NodePtr &func_node, map &outputs) { + // If outputs index invalid, add Data and Input Tensor. 
+ for (auto &item : outputs) { + if (item.second != kInvalidParent) { + continue; + } + + // Add Data to subgraph. + map append_num; + for (auto &groups : graph_datas) { + const auto &subgraph = groups.first; + auto &data_nodes = groups.second; + + item.second = func_node->GetAllInDataAnchorsSize() + append_num[subgraph]; // Update to valid parent index. + const auto data_name = subgraph->GetName() + "_data_" + std::to_string(item.second); + + OpDescBuilder op_builder(data_name, DATA); + const OpDescPtr op_desc = op_builder.AddInput("x").AddOutput("y").Build(); + if (op_desc == nullptr) { + GELOGE(OUT_OF_MEMORY, "Create multi-batch subgraph data desc failed"); + return OUT_OF_MEMORY; + } + + uint32_t data_index = item.second - kCaseInputBase; + if (!AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, data_index)) { + GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str()); + return FAILED; + } + + if (!AttrUtils::SetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, item.second)) { + GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str()); + return FAILED; + } + + append_num[subgraph]++; + data_nodes[item.second] = subgraph->AddNode(op_desc); + GELOGI("Add Node: %s, parent index: %u", op_desc->GetName().c_str(), item.second); + } + + // Add InputTensor to functional Node. + NodeUtils::AppendInputAnchor(func_node, item.second + 1); + } + + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief Delete Node from all subgraph. +/// @param [in] graph_nodes: Data groups of subgraph. +/// @param [in] detach: Node will move to parent. +/// @param [in] outputs: Parent index of Node output. +/// @return 0: SUCCESS / others: FAILED +/// +Status SubgraphConstMigrationPass::DetachParallelNode(const map &graph_datas, const NodePtr &detach, + const map &outputs) { + // Break Data and Move node. 
+ const auto &in_anchor = detach->GetInControlAnchor(); + const auto &out_anchors = in_anchor->GetPeerOutControlAnchors(); + for (size_t i = out_anchors.size(); i > 0; --i) { + const auto &out_anchor = out_anchors.at(i - 1); + GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(out_anchor, in_anchor), "Remove edge failed"); + const auto &owner_node = out_anchor->GetOwnerNode(); + GELOGI("Remove Edge: %s %s", owner_node->GetName().c_str(), detach->GetName().c_str()); + } + + // Break Move and follow, Link Data and follow. + for (uint32_t i = 0; i < detach->GetAllOutDataAnchorsSize(); ++i) { + auto it_idx = outputs.find(i); + if (it_idx == outputs.end()) { + GELOGE(FAILED, "Node: %s parent index %u not found", detach->GetName().c_str(), i); + return FAILED; + } + + auto it_data = graph_datas.find(it_idx->second); + if (it_data == graph_datas.end()) { + GELOGE(FAILED, "Node: %s parent index %u not found", detach->GetName().c_str(), i); + return FAILED; + } + + const auto &data_node = it_data->second; + const auto &out_anchor = detach->GetOutDataAnchor(i); + + const auto &out_desc = detach->GetOpDesc()->GetOutputDesc(i); + const auto &data_desc = data_node->GetOpDesc(); + (void)data_desc->UpdateInputDesc(kDataOutIndex, out_desc); // Set Data Input to new connect Node. + (void)data_desc->UpdateOutputDesc(kDataOutIndex, out_desc); // Set Data Output to new connect Node. 
+ + for (const auto &in_anchor : out_anchor->GetPeerInDataAnchors()) { + if (in_anchor == nullptr) { + continue; + } + GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(out_anchor, in_anchor), "Remove edge failed"); + const auto &owner_node = in_anchor->GetOwnerNode(); + GELOGI("Remove Edge: %s %s", detach->GetName().c_str(), owner_node->GetName().c_str()); + + const auto &data_out_anchor = data_node->GetOutDataAnchor(kDataOutIndex); + GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(data_out_anchor, in_anchor), "Add edge failed"); + GELOGI("Add Edge: %s %s", data_node->GetName().c_str(), owner_node->GetName().c_str()); + } + } + + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief Move Node to Parent Graph. +/// @param [in] graph: Parent compute graph. +/// @param [in] func_node: functional Node of Case. +/// @param [in] attach: Node will move to parent. +/// @param [in] inputs: Parent index of Node input. +/// @param [in] outputs: Parent index of Node output. +/// @return 0: SUCCESS / others: FAILED +/// +Status SubgraphConstMigrationPass::AttachParallelNode(const ComputeGraphPtr &graph, const NodePtr &func_node, + const NodePtr &attach, const map &inputs, + const map &outputs) { + GE_CHECK_NOTNULL(attach); + for (const auto item : inputs) { + if (item.second == kInvalidParent) { // Not connect, Skip. 
+ continue; + } + + const auto &in_anchor = func_node->GetInDataAnchor(item.second); + const auto &out_anchor = in_anchor->GetPeerOutAnchor(); + const auto &owner_node = out_anchor->GetOwnerNode(); + const auto &in_control = attach->GetInControlAnchor(); + GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(owner_node->GetOutControlAnchor(), in_control), "Add edge failed"); + GELOGI("Add Edge: %s %s", owner_node->GetName().c_str(), attach->GetName().c_str()); + } + + for (const auto &item : outputs) { + const auto &func_desc = func_node->GetOpDesc(); + const auto &out_desc = attach->GetOpDesc()->GetOutputDesc(item.second); + (void)func_desc->UpdateInputDesc(item.second, out_desc); // Set Data Input to new connect Node. + + const auto &in_anchor = func_node->GetInDataAnchor(item.second); + const auto &out_anchor = in_anchor->GetPeerOutAnchor(); + if (out_anchor != nullptr) { + GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(out_anchor, in_anchor), "Remove edge failed"); + const auto &owner_node = out_anchor->GetOwnerNode(); + GELOGI("Remove Edge: %s %s", owner_node->GetName().c_str(), func_node->GetName().c_str()); + } + GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(attach->GetOutDataAnchor(item.first), in_anchor), "Add edge failed"); + GELOGI("Add Edge: %s %s", attach->GetName().c_str(), func_node->GetName().c_str()); + } + + (void)graph->AddNode(attach); + (void)attach->SetOwnerComputeGraph(graph); + GELOGI("Add Node: %s %s", graph->GetName().c_str(), attach->GetName().c_str()); + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief Move node to Parent graph. +/// @param [in] graph: Root compute graph. +/// @param [in] func_node: functional Node of Case. +/// @param [in] graph_nodes: Data groups of subgraph. +/// @param [in] index: anchor index of move Node. +/// @param [in] inputs: Parent index of Node input. +/// @param [in] outputs: Parent index of Node output. 
+/// @return 0: SUCCESS / others: FAILED +/// +Status SubgraphConstMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph, const NodePtr &func_node, + const map> &graph_datas, + uint32_t parent_index, uint32_t index, + const map &inputs, + const map &outputs) { + if (inputs.empty()) { + GELOGE(FAILED, "Graph: %s, inputs is empty", graph->GetName().c_str()); + return FAILED; + } + + NodePtr move_node; + for (auto &groups : graph_datas) { + const auto &subgraph = groups.first; + const auto &data_nodes = groups.second; + auto it = data_nodes.find(parent_index); + if (it == data_nodes.end()) { + GELOGE(FAILED, "Graph: %s, Data: %u node not found", subgraph->GetName().c_str(), parent_index); + return FAILED; + } + + const auto &base_data = it->second; + const auto &out_anchor = base_data->GetOutControlAnchor(); + const auto &in_anchors = out_anchor->GetPeerInControlAnchors(); + if (in_anchors.size() <= index || in_anchors.at(index) == nullptr) { + GELOGE(FAILED, "Data: %s, anchor size: %zu, index: %u not found", base_data->GetName().c_str(), in_anchors.size(), + index); + return FAILED; + } + + const auto &in_anchor = in_anchors.at(index); + move_node = in_anchor->GetOwnerNode(); + if (move_node == nullptr) { + GELOGE(FAILED, "Data: %s not found, index: %u", base_data->GetName().c_str(), parent_index); + return FAILED; + } + + if (DetachParallelNode(data_nodes, move_node, outputs) != SUCCESS) { + GELOGE(FAILED, "Data: %s not found, index: %u", base_data->GetName().c_str(), parent_index); + return FAILED; + } + + GE_CHK_GRAPH_STATUS_RET(subgraph->RemoveNode(move_node), "Remove node failed"); + GELOGI("Remove Node: %s %s", subgraph->GetName().c_str(), move_node->GetName().c_str()); + } + + if (AttachParallelNode(graph, func_node, move_node, inputs, outputs) != SUCCESS) { + return FAILED; + } + + migration_append_ = true; + return SUCCESS; +} +} // namespace ge diff --git a/src/ge/graph/passes/subgraph_const_migration_pass.h 
b/src/ge/graph/passes/subgraph_const_migration_pass.h new file mode 100644 index 00000000..604cdb02 --- /dev/null +++ b/src/ge/graph/passes/subgraph_const_migration_pass.h @@ -0,0 +1,138 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_COMMON_SUBGRAPH_CONST_MIGRATION_H_ +#define GE_COMMON_SUBGRAPH_CONST_MIGRATION_H_ + +#include "graph/types.h" +#include "inc/graph_pass.h" + +#include +#include +#include +#include + +using std::map; +using std::set; + +namespace ge { +class SubgraphConstMigrationPass : public GraphPass { + public: + Status Run(ComputeGraphPtr graph) override; + + private: + /// + /// @ingroup ge + /// @brief Get all Data nodes for all subgraph. + /// @param [in] graph: Root compute graph. + /// @param [in] func_desc: functional OpDesc of Case. + /// @param [out] graph_datas: Data groups of subgraph. + /// @return 0: SUCCESS / others: FAILED + /// + Status ClassifyDataNodes(const ComputeGraphPtr &graph, const OpDescPtr &func_desc, + map> &graph_datas); + + /// + /// @ingroup ge + /// @brief Get all Data nodes for all subgraph. + /// @param [in] node: Const node of subgraph. + /// @param [in] func_desc: functional OpDesc of Case. + /// @param [out] graph_nodes: Data groups of subgraph. 
+ /// @return true: SUCCESS / false: FAILED + /// + bool GetAssociatedNodes(const NodePtr &node, map &inputs, map &outputs); + + /// + /// @ingroup ge + /// @brief Get all Data nodes for all subgraph. + /// @param [in] graph_nodes: Data groups of subgraph. + /// @param [in] data_base: Data Node for migration. + /// @param [in] data_idx: Data groups of subgraph. + /// @param [in] data_idx: Data groups of subgraph. + /// @return true: Same / false: not same + /// + bool IsParallelNodeSame(const map> &graph_nodes, const NodePtr &const_node, + uint32_t parent_index, size_t index); + + /// + /// @ingroup ge + /// @brief Migration subgraph Node to Root + /// @param [in] graph: Root compute graph. + /// @param [in] func_node: functional Node of Case. + /// @param [in] graph_nodes: Data groups of subgraph. + /// @param [in] data_base: Data Node for migration. + /// @param [in] data_idx: Data groups of subgraph. + /// @return 0: SUCCESS / others: FAILED + /// + Status GraphNodeMigration(const ComputeGraphPtr &graph, const NodePtr &func_node, + map> &graph_nodes, const NodePtr &data_base, + uint32_t data_idx); + + /// + /// @ingroup ge + /// @brief Move node to Parent graph. + /// @param [in] graph: Root compute graph. + /// @param [in] func_node: functional Node of Case. + /// @param [in] graph_nodes: Data groups of subgraph. + /// @param [in] anchor_idx: anchor index of move Node. + /// @param [in] inputs: Parent index of Node input. + /// @param [in] outputs: Parent index of Node output. + /// @return 0: SUCCESS / others: FAILED + /// + Status MoveNodeToParent(const ComputeGraphPtr &graph, const NodePtr &func_node, + const map> &graph_nodes, uint32_t parent_index, + uint32_t anchor_idx, const map &inputs, + const map &outputs); + + /// + /// @ingroup ge + /// @brief Append Input Tensor for functional node. + /// @param [in] graph_nodes: Data groups of subgraph. + /// @param [in] func_node: functional Node of Case. + /// @param [in] outputs: Parent index of Node output. 
+ /// @return 0: SUCCESS / others: FAILED + /// + Status AppendParallelNode(map> &graph_nodes, const NodePtr &func_node, + map &outputs); + + /// + /// @ingroup ge + /// @brief Delete Node from all subgraph. + /// @param [in] graph_nodes: Data groups of subgraph. + /// @param [in] detach: Node will move to parent. + /// @param [in] outputs: Parent index of Node output. + /// @return 0: SUCCESS / others: FAILED + /// + Status DetachParallelNode(const map &graph_datas, const NodePtr &detach, + const map &outputs); + + /// + /// @ingroup ge + /// @brief Move Node to Parent Graph. + /// @param [in] graph: Parent compute graph. + /// @param [in] func_node: functional Node of Case. + /// @param [in] attach: Node will move to parent. + /// @param [in] inputs: Parent index of Node input. + /// @param [in] outputs: Parent index of Node output. + /// @return 0: SUCCESS / others: FAILED + /// + Status AttachParallelNode(const ComputeGraphPtr &graph, const NodePtr &func_node, const NodePtr &attach, + const map &inputs, const map &outputs); + + bool migration_append_{false}; +}; +} // namespace ge +#endif // GE_COMMON_SUBGRAPH_CONST_MIGRATION_H_ \ No newline at end of file diff --git a/src/ge/graph/passes/unused_args_clean_pass.cc b/src/ge/graph/passes/unused_args_clean_pass.cc index 62094631..3f79fc79 100644 --- a/src/ge/graph/passes/unused_args_clean_pass.cc +++ b/src/ge/graph/passes/unused_args_clean_pass.cc @@ -161,6 +161,10 @@ Status UnusedArgsCleanPass::UpdateInputTensor(const mapGetPeerOutAnchor(); const auto &out_node = out_anchor->GetOwnerNode(); + const auto &func_desc = func_node->GetOpDesc(); + const auto &old_desc = func_desc->GetInputDesc(parent_index); + (void)func_desc->UpdateInputDesc(update_index, old_desc); + GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(out_anchor, new_anchor), "Add edge failed"); GELOGI("Add edge success, func node: %s, node: %s, parent index: %u, update index: %u", func_node->GetName().c_str(), out_node->GetName().c_str(), parent_index, 
update_index); diff --git a/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc b/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc index 545fe66f..8a6e0bdc 100644 --- a/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc +++ b/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc @@ -790,22 +790,24 @@ Status AippOp::AddAttrToAippData(const OpDescPtr &aipp_data_op_desc) { } Status AippOp::AddNodeToGraph(const NodePtr &aipp_node, int64_t max_dynamic_aipp_size) { - static int index = 0; std::vector input_shape_dim(1, max_dynamic_aipp_size); GeShape input_shape(input_shape_dim); // construct input tensor GeTensorDesc input_tensor(input_shape, FORMAT_ND, DT_UINT8); TensorUtils::SetReuseInput(input_tensor, false); TensorUtils::SetSize(input_tensor, max_dynamic_aipp_size); - + GE_CHECK_NOTNULL(aipp_node); const ComputeGraphPtr &graph = aipp_node->GetOwnerComputeGraph(); string node_name; - if (index == 0) { + // First aippdata name should be definite. + if (graph->FindFirstNodeMatchType(AIPPDATA) == nullptr) { + GELOGI("Current graph has no aippdata node, so the name of it must be definite."); node_name = kDynamicAippData; } else { - node_name = string(kDynamicAippData) + "_" + to_string(index); + node_name = string(kDynamicAippData) + "_" + aipp_node->GetName(); } - ++index; + GELOGI("Current add aippdata node name is %s", node_name.c_str()); + // new add aipp_data ops for dynamic aipp param input OpDescPtr op_desc_ptr_data = MakeShared(node_name, AIPPDATA); GE_CHECK_NOTNULL(op_desc_ptr_data); diff --git a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc index 83a16e75..a2f9c25c 100644 --- a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc +++ b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc @@ -132,7 +132,6 @@ Status InsertNewOpUtil::CheckPositionNotRepeat() { return PARAM_INVALID;); } } - return SUCCESS; } diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.cc 
b/src/ge/graph/preprocess/multi_batch_copy_graph.cc index 336527fb..b22e4566 100644 --- a/src/ge/graph/preprocess/multi_batch_copy_graph.cc +++ b/src/ge/graph/preprocess/multi_batch_copy_graph.cc @@ -55,9 +55,6 @@ const int kDataOutIndex = 0; const int kDataInIndex = 0; const int kMergeDataOutIndex = 0; const int kStaticOutput = -1; -const int kDynmaicDims = -1; -const int kDynamicBatchDynamicDimsNum = 1; -const int kDynamicImgSizeDynamciDimsNum = 2; inline bool IsDataLikeType(const std::string &node_type) { return (node_type == DATA) || (node_type == AIPP); } @@ -213,16 +210,16 @@ Status MultiBatchGraphCopyer::CopyGraph() { return ret; } - ret = CheckDataShape(origin_data_nodes_); - if (ret != SUCCESS) { - return ret; - } - if (LabelStatus() != SUCCESS) { GELOGE(INTERNAL_ERROR, "Failed to label status for all nodes."); return INTERNAL_ERROR; } + ret = CheckAndParseDynamicData(); + if (ret != SUCCESS) { + return ret; + } + ret = CreateNewNodes(); if (ret != SUCCESS) { return ret; @@ -316,6 +313,61 @@ Status MultiBatchGraphCopyer::LabelStatus() { return SUCCESS; } +Status MultiBatchGraphCopyer::CheckAndParseDynamicData() { + size_t unknown_shape_count = 0; + auto data_name_and_shape = GetLocalOmgContext().user_input_dims; + GELOGD("raw data_name_and_shape size: %zu", data_name_and_shape.size()); + for (const auto &node : origin_all_nodes_) { + auto data_desc = NodeUtils::GetOutputDesc(*node, kDataOutIndex); + auto data_shape = data_desc.GetShape(); + auto data_format = data_desc.GetFormat() == Format::FORMAT_NCHW + ? "NCHW" + : data_desc.GetFormat() == Format::FORMAT_NHWC ? 
"NHWC" : "Others"; + + auto data_name = node->GetName(); + auto branch_status = GetNodeStatus(node); + if (branch_status != kNodeStartNode) { + continue; + } + if (IsAllDimsPositive(data_shape.GetDims())) { + continue; + } + ++unknown_shape_count; + auto iter = find(data_name_order_.begin(), data_name_order_.end(), data_name); + if (iter == data_name_order_.end()) { + if (dynamic_type_ == DynamicType::kDynamicBatch) { + auto ret = CheckDynamicBatchShape(data_shape.GetDims(), data_name); + if (!ret) { + return PARAM_INVALID; + } + } else if (dynamic_type_ == DynamicType::kDynamicImageSize) { + auto ret = CheckDynamicImageSizeShape(data_shape.GetDims(), data_name, data_format); + if (!ret) { + return PARAM_INVALID; + } + } else if (dynamic_type_ == DynamicType::kDynamicDims) { + ErrorManager::GetInstance().ATCReportErrMessage( + "E10001", {"parameter", "reason"}, {"--input_shape", "all dynamic data must be set in --input_shape"}); + GELOGE(INTERNAL_ERROR, "data: %s shape:%s must be set int --input_shape", node->GetName().c_str(), + data_shape.ToString().c_str()); + return INTERNAL_ERROR; + } + data_name_and_shape.emplace_back(data_name, data_shape.GetDims()); + } + } + auto ret = ParserDataToDynmaicInfo(shapes_, data_name_and_shape, data_to_dynamic_info_); + if (ret != SUCCESS) { + return ret; + } + if (unknown_shape_count == 0) { + ErrorManager::GetInstance().ATCReportErrMessage("E10040"); + GELOGE(PARAM_INVALID, + "Need unknow shape data when user set --dynamic_batch_size, --dynamic_image_size or --dynamic_dims"); + return PARAM_INVALID; + } + return SUCCESS; +} + Status MultiBatchGraphCopyer::CreateNewNodes() { shape_data_ = InsertShapeDataNode(); if (shape_data_ == nullptr) { @@ -331,10 +383,6 @@ Status MultiBatchGraphCopyer::CreateNewNodes() { switch (branch_status) { case kNodeStartNode: GELOGD("Name: %s, type: %s, status: kNodeStartNode.", node->GetName().c_str(), node->GetType().c_str()); - ret = UpdateDataToDynamicInfo(node); - if (ret != SUCCESS) { - break; 
- } ret = InsertSwitchNForData(node); if (ret == SUCCESS) { ret = UpdateMaxShapeToData(node); @@ -652,7 +700,6 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &data) { auto data_shape = NodeUtils::GetOutputDesc(*data, kDataOutIndex).GetShape(); auto data_name = data->GetName(); (void)AttrUtils::SetListInt(data->GetOpDesc(), ATTR_MBATCH_ORIGIN_INPUT_DIMS, data_shape.GetDims()); - if (IsAllDimsPositive(data_shape.GetDims())) { GELOGI("The shape of data %s are positive(%s), skip the multi batch process", data->GetName().c_str(), data_shape.ToString().c_str()); @@ -731,57 +778,6 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &data) { data_nodes_to_switchn_[data.get()] = switchn; return SUCCESS; } -Status MultiBatchGraphCopyer::UpdateDataToDynamicInfo(const NodePtr &data) { - auto data_desc = NodeUtils::GetOutputDesc(*data, kDataOutIndex); - auto data_shape = data_desc.GetShape(); - auto data_format = data_desc.GetFormat(); - auto data_name = data->GetName(); - if (IsAllDimsPositive(data_shape.GetDims())) { - return SUCCESS; - } - if (data_to_dynamic_info_.find(data_name) == data_to_dynamic_info_.end()) { - auto data_shape_dims = data_shape.GetDims(); - auto dynamic_dims_num = std::count_if(data_shape_dims.begin(), data_shape_dims.end(), - [&data_shape_dims](int64_t dim) { return dim < 0; }); - if (dynamic_type_ == DynamicType::kDynamicBatch) { - if (dynamic_dims_num != kDynamicBatchDynamicDimsNum || data_shape.GetDim(0) != kDynmaicDims) { - GELOGE(INTERNAL_ERROR, "data: %s shape:%s do not satisfy dynamic batch rule", data->GetName().c_str(), - data_shape.ToString().c_str()); - return INTERNAL_ERROR; - } - } else if (dynamic_type_ == DynamicType::kDynamicImageSize) { - int64_t height = 0; - int64_t width = 0; - if (data_format == FORMAT_NCHW) { - height = data_shape.GetDim(NCHW_DIM_H); - width = data_shape.GetDim(NCHW_DIM_W); - } else if (data_format == FORMAT_NHWC) { - height = data_shape.GetDim(NHWC_DIM_H); - width = 
data_shape.GetDim(NHWC_DIM_W); - } - if (dynamic_dims_num != kDynamicImgSizeDynamciDimsNum || height != kDynmaicDims || width != kDynmaicDims) { - GELOGE(INTERNAL_ERROR, "data: %s shape:%s do not satisfy dynamic image size rule", data->GetName().c_str(), - data_shape.ToString().c_str()); - return INTERNAL_ERROR; - } - } else if (dynamic_type_ == DynamicType::kDynamicDims) { - GELOGE(INTERNAL_ERROR, "data: %s shape:%s must be set int --input_shape", data->GetName().c_str(), - data_shape.ToString().c_str()); - return INTERNAL_ERROR; - } - // all data has dynamic dims are not in atc parameter --input_shape - if (data_to_dynamic_info_.empty()) { - vector>> tmp_data_name_and_shape{std::make_pair(data_name, data_shape_dims)}; - auto ret = ParserDataToDynmaicInfo(shapes_, tmp_data_name_and_shape, data_to_dynamic_info_); - if (ret != SUCCESS) { - GELOGE(INTERNAL_ERROR, "parse data : %s dynamic gear info failed", data_name.c_str()); - return INTERNAL_ERROR; - } - } - data_to_dynamic_info_[data_name] = data_to_dynamic_info_.begin()->second; - } - return SUCCESS; -} Status MultiBatchGraphCopyer::InsertMergeForEdgeNode(const NodePtr &node) { for (auto &in_data_anchor : node->GetAllInDataAnchors()) { auto src_out_anchor = in_data_anchor->GetPeerOutAnchor(); @@ -1032,12 +1028,6 @@ Status ProcessMultiBatch(ComputeGraphPtr &graph) { GELOGD("There is no multi-batch options, no need to process multi-batch copy"); return SUCCESS; } - map>> data_to_dynamic_info; - // parser data dynamic info from atc parameter --input_shape - if (ParserDataToDynmaicInfo(shapes, GetLocalOmgContext().user_input_dims, data_to_dynamic_info) != SUCCESS) { - GELOGE(PARAM_INVALID, "Parse each data's own dynamic info failed"); - return PARAM_INVALID; - } DynamicType dynamic_type = DynamicType::kDynamicUnknown; if (!GetLocalOmgContext().dynamic_batch_size.empty()) { dynamic_type = DynamicType::kDynamicBatch; @@ -1057,7 +1047,6 @@ Status ProcessMultiBatch(ComputeGraphPtr &graph) { } 
copyer.SetDynamicType(dynamic_type); copyer.SetUserDesignateShape(user_designate_shape); - copyer.SetDataToDynamicInfo(data_to_dynamic_info); return copyer.CopyGraph(); } diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.h b/src/ge/graph/preprocess/multi_batch_copy_graph.h index 062b98d2..c75bf981 100644 --- a/src/ge/graph/preprocess/multi_batch_copy_graph.h +++ b/src/ge/graph/preprocess/multi_batch_copy_graph.h @@ -102,7 +102,7 @@ class MultiBatchGraphCopyer { Status LinkNodeToMerge(const NodePtr &node, int out_index, const NodePtr &merge); Status CopyInDataEdges(const NodePtr &origin_node, int batch_num, const NodePtr ©ed_node); Status CopyInControlEdges(const NodePtr &node, int batch_num, const NodePtr ©ed_node); - Status UpdateDataToDynamicInfo(const NodePtr &node); + Status CheckAndParseDynamicData(); bool IsInBatchBranch(const NodePtr &node); NodeStatus GetNodeStatus(const NodePtr &node) { return origin_nodes_status_[node.get()]; }; Status CheckCopyResult(const std::vector &start_nodes); diff --git a/src/ge/graph/preprocess/multi_batch_options.cc b/src/ge/graph/preprocess/multi_batch_options.cc index 005240ca..e22af692 100644 --- a/src/ge/graph/preprocess/multi_batch_options.cc +++ b/src/ge/graph/preprocess/multi_batch_options.cc @@ -26,12 +26,18 @@ #include "graph/utils/node_utils.h" #include "graph/ge_context.h" #include "graph/common/local_context.h" +#include "framework/common/types.h" namespace ge { namespace multibatch { constexpr int kDecimal = 10; constexpr uint8_t kMaxShapesCount = 100; constexpr uint8_t kMinShapesCount = 2; +const int kDynmaicDims = -1; +const int kDynamicBatchDynamicDimsNum = 1; +const int kDynamicImgSizeDynamciDimsNum = 2; +const size_t kMaxNDDimNum = 4; +const size_t kMinNDDimNum = 1; void ParseDynamicSize(string dynamic_size, vector> &shapes) { std::vector shape_strs = ge::StringUtils::Split(dynamic_size, ';'); @@ -252,5 +258,62 @@ Status StampDynamicType(const OpDescPtr &op_desc) { } return SUCCESS; } + +/// +/// 
@ingroup ge +/// @brief Check dynamic batch Shape. +/// @param [in] const vector &shape: data_shape to be checked. +/// @param [in] const string &data_name: cur data name. +/// @return 0: true/false +/// +bool CheckDynamicBatchShape(const vector &shape, const string &data_name) { + if (shape[0] == kDynmaicDims) { + for (size_t i = 1; i < shape.size(); ++i) { + if (shape[i] < 1) { + ErrorManager::GetInstance().ATCReportErrMessage("E10018", {"index", "shape"}, + {std::to_string(i), std::to_string(shape[i])}); + GELOGE(ge::PARAM_INVALID, + "Only batch N can be -1 when set --dynamic_batch_size, current data: %s shape[%zu] is %ld", + data_name.c_str(), i, shape[i]); + return false; + } + } + return true; + } else { + return false; + } +} + +/// +/// @ingroup ge +/// @brief Check Dynamic image size shape. +/// @param [in] unordered_map> &shape_map: map of data_name and data_shape. +/// @param [in] const std::string &input_format: format of input. +/// @return 0: true/false +/// +bool CheckDynamicImageSizeShape(const vector &shape, const string &data_name, + const std::string &input_format) { + int64_t height = 0; + int64_t width = 0; + if (input_format == "NCHW") { + height = shape[NCHW_DIM_H]; + width = shape[NCHW_DIM_W]; + } + + if (input_format == "NHWC") { + height = shape[NHWC_DIM_H]; + width = shape[NHWC_DIM_W]; + } + + if (height == kDynmaicDims && width == kDynmaicDims && + std::count(shape.begin(), shape.end(), kDynmaicDims) == kDynamicImgSizeDynamciDimsNum) { + return true; + } else { + ErrorManager::GetInstance().ATCReportErrMessage("E10019"); + GELOGE(ge::PARAM_INVALID, + "--input_shape's shape is invalid, only height and width can be -1 when set --dynamic_image_size."); + return false; + } +} } // namespace multibatch } // namespace ge diff --git a/src/ge/graph/preprocess/multi_batch_options.h b/src/ge/graph/preprocess/multi_batch_options.h index 18f667ae..b5616399 100644 --- a/src/ge/graph/preprocess/multi_batch_options.h +++ 
b/src/ge/graph/preprocess/multi_batch_options.h @@ -70,6 +70,27 @@ Status ParserDataToDynmaicInfo(const vector> &shapes, /// @return 0: SUCCESS / others: INTERNAL_ERROR /// Status StampDynamicType(const OpDescPtr &op_desc); + +/// +/// @ingroup ge +/// @brief Check dynamic batch Shape. +/// @param [in] const vector &shape: data_shape to be checked. +/// @param [in] const string &data_name: cur data name. +/// @return 0: true/false +/// +bool CheckDynamicBatchShape(const vector &shape, const string &data_name); + +/// +/// @ingroup ge +/// @brief Check Dynamic image size shape. +/// @param [in] unordered_map> &shape_map: map of data_name and data_shape. +/// @param [in] const string &data_name: cur data name. +/// @param [in] const std::string &input_format: cur data format. +/// @param [in] const std::string &input_format: format of input. +/// @return 0: true/false +/// +bool CheckDynamicImageSizeShape(const vector &shape, const string &data_name, const std::string &input_format); + } // namespace multibatch } // namespace ge #endif // GE_GRAPH_PREPROCESS_MULTI_BATCH_OPTIONS_H_ diff --git a/src/ge/host_cpu_engine/CMakeLists.txt b/src/ge/host_cpu_engine/CMakeLists.txt new file mode 100644 index 00000000..a1c19eac --- /dev/null +++ b/src/ge/host_cpu_engine/CMakeLists.txt @@ -0,0 +1,209 @@ +set(PROTO_LIST + "${METADEF_DIR}/proto/task.proto" +) + +protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) + +set(SRC_LIST + "engine/host_cpu_engine.cc" + "ops_kernel_store/host_cpu_ops_kernel_info.cc" + "ops_kernel_store/op/op_factory.cc" + "ops_kernel_store/op/host_op.cc" +) + +set(CPU_OPS_KERNEL_LIST + "ops_kernel_store/host_cpu_ops_kernel_builder.cc" +) + +############ libhost_cpu_engine.so ############ +add_library(host_cpu_engine SHARED ${SRC_LIST} ${PROTO_HDRS}) + +target_compile_options(host_cpu_engine PRIVATE + -Werror +) + +target_include_directories(host_cpu_engine PRIVATE + ${CMAKE_CURRENT_LIST_DIR} + ${GE_CODE_DIR}/ge + ${GE_CODE_DIR}/inc + 
${GE_CODE_DIR}/inc/external + ${GE_CODE_DIR}/inc/framework + ${GE_CODE_DIR}/third_party/fwkacllib/inc + ${METADEF_DIR}/inc + ${METADEF_DIR}/inc/external + ${METADEF_DIR}/inc/external/graph + ${CMAKE_BINARY_DIR} + ${CMAKE_BINARY_DIR}/proto/ge + #### yellow zone #### + ${GE_CODE_DIR}/../inc +) + +target_link_libraries(host_cpu_engine PRIVATE + $ + -Wl,--no-as-needed + protobuf + c_sec + graph + register + slog + runtime + -Wl,--as-needed +) + +############ atcstub/libhost_cpu_engine.so ############ +add_library(atc_host_cpu_engine SHARED ${SRC_LIST} ${PROTO_HDRS}) + +target_compile_options(atc_host_cpu_engine PRIVATE + -Werror +) + +target_compile_definitions(atc_host_cpu_engine PRIVATE + COMPILE_OMG_PACKAGE +) + +target_include_directories(atc_host_cpu_engine PRIVATE + ${CMAKE_CURRENT_LIST_DIR} + ${GE_CODE_DIR}/ge + ${GE_CODE_DIR}/inc + ${GE_CODE_DIR}/inc/external + ${GE_CODE_DIR}/inc/framework + ${GE_CODE_DIR}/third_party/fwkacllib/inc + ${METADEF_DIR}/inc + ${METADEF_DIR}/inc/external + ${METADEF_DIR}/inc/external/graph + ${CMAKE_BINARY_DIR} + ${CMAKE_BINARY_DIR}/proto/ge + #### yellow zone #### + ${GE_CODE_DIR}/../inc +) + +target_link_libraries(atc_host_cpu_engine PRIVATE + $ + -Wl,--no-as-needed + protobuf + c_sec + graph + register + slog + runtime_compile + -Wl,--as-needed +) + +set_target_properties(atc_host_cpu_engine PROPERTIES + OUTPUT_NAME host_cpu_engine + LIBRARY_OUTPUT_DIRECTORY atclib +) + +############ libhost_cpu_opskernel_builder.so ############ +add_library(host_cpu_opskernel_builder SHARED ${CPU_OPS_KERNEL_LIST}) + +target_compile_options(host_cpu_opskernel_builder PRIVATE + -Werror +) + +target_include_directories(host_cpu_opskernel_builder PRIVATE + ${CMAKE_CURRENT_LIST_DIR} + ${GE_CODE_DIR}/ge + ${GE_CODE_DIR}/inc + ${GE_CODE_DIR}/inc/external + ${GE_CODE_DIR}/inc/framework + ${GE_CODE_DIR}/third_party/fwkacllib/inc + ${METADEF_DIR}/inc + ${METADEF_DIR}/inc/external + ${METADEF_DIR}/inc/external/graph + ${CMAKE_BINARY_DIR} + 
${CMAKE_BINARY_DIR}/proto/ge + #### yellow zone #### + ${GE_CODE_DIR}/../inc +) + +target_link_libraries(host_cpu_opskernel_builder PRIVATE + $ + -Wl,--no-as-needed + protobuf + c_sec + slog + graph + register + -Wl,--as-needed +) + +############ atclib/libhost_cpu_opskernel_builder.so ############ +add_library(atc_host_cpu_opskernel_builder SHARED ${CPU_OPS_KERNEL_LIST}) + +target_compile_options(atc_host_cpu_opskernel_builder PRIVATE + -Werror +) + +target_include_directories(atc_host_cpu_opskernel_builder PRIVATE + ${CMAKE_CURRENT_LIST_DIR} + ${GE_CODE_DIR}/ge + ${GE_CODE_DIR}/inc + ${GE_CODE_DIR}/inc/external + ${GE_CODE_DIR}/inc/framework + ${GE_CODE_DIR}/third_party/fwkacllib/inc + ${METADEF_DIR}/inc + ${METADEF_DIR}/inc/external + ${METADEF_DIR}/inc/external/graph + ${CMAKE_BINARY_DIR} + ${CMAKE_BINARY_DIR}/proto/ge + #### yellow zone #### + ${GE_CODE_DIR}/../inc +) + +target_link_libraries(atc_host_cpu_opskernel_builder PRIVATE + $ + -Wl,--no-as-needed + protobuf + c_sec + slog + graph + register + -Wl,--as-needed +) + +set_target_properties(atc_host_cpu_opskernel_builder PROPERTIES + OUTPUT_NAME host_cpu_opskernel_builder + LIBRARY_OUTPUT_DIRECTORY atclib +) + +############ libhost_cpu_opskernel_builder.a ############ +add_library(host_cpu_opskernel_builder_static SHARED ${CPU_OPS_KERNEL_LIST}) + +target_compile_options(host_cpu_opskernel_builder_static PRIVATE + -Werror +) + +target_include_directories(host_cpu_opskernel_builder_static PRIVATE + ${CMAKE_CURRENT_LIST_DIR} + ${GE_CODE_DIR}/ge + ${GE_CODE_DIR}/inc + ${GE_CODE_DIR}/inc/external + ${GE_CODE_DIR}/inc/framework + ${GE_CODE_DIR}/third_party/fwkacllib/inc + ${METADEF_DIR}/inc + ${METADEF_DIR}/inc/external + ${METADEF_DIR}/inc/external/graph + ${CMAKE_BINARY_DIR} + ${CMAKE_BINARY_DIR}/proto/ge + #### yellow zone #### + ${GE_CODE_DIR}/../inc +) + +target_link_libraries(host_cpu_opskernel_builder_static PRIVATE + $ + protobuf + c_sec +) + +############ install ############ +set(INSTALL_BASE_DIR "") 
+set(INSTALL_LIBRARY_DIR lib) + +install(TARGETS host_cpu_engine host_cpu_opskernel_builder OPTIONAL + LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR} +) + +install(TARGETS atc_host_cpu_engine atc_host_cpu_opskernel_builder OPTIONAL + LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR}/atclib +) diff --git a/src/ge/host_cpu_engine/module.mk b/src/ge/host_cpu_engine/module.mk index e35c68c9..3c8e0cc9 100644 --- a/src/ge/host_cpu_engine/module.mk +++ b/src/ge/host_cpu_engine/module.mk @@ -15,6 +15,7 @@ local_lib_inc_path := proto/task.proto \ ${TOPDIR}third_party/protobuf/include \ ${TOPDIR}inc/framework \ $(TOPDIR)framework/domi \ + $(TOPDIR)graphengine/ge \ #compiler for host include $(CLEAR_VARS) @@ -55,3 +56,83 @@ LOCAL_SRC_FILES := $(local_lib_src_files) LOCAL_C_INCLUDES := $(local_lib_inc_path) include ${BUILD_HOST_SHARED_LIBRARY} + +#compiler for host ops kernel builder +include $(CLEAR_VARS) +LOCAL_MODULE := libhost_cpu_opskernel_builder +LOCAL_CFLAGS += -Werror +LOCAL_CFLAGS += -std=c++11 +LOCAL_LDFLAGS := + +LOCAL_STATIC_LIBRARIES := +LOCAL_SHARED_LIBRARIES := libprotobuf \ + libc_sec \ + libslog \ + libgraph \ + libregister \ + +LOCAL_SRC_FILES := ops_kernel_store/host_cpu_ops_kernel_builder.cc + +LOCAL_C_INCLUDES := $(local_lib_inc_path) + +include ${BUILD_HOST_SHARED_LIBRARY} + +#compiler for host static lib +include $(CLEAR_VARS) +LOCAL_MODULE := libhost_cpu_opskernel_builder +LOCAL_CFLAGS += -Werror +LOCAL_CFLAGS += -std=c++11 +LOCAL_LDFLAGS := + +LOCAL_STATIC_LIBRARIES := libprotobuf \ + libgraph \ + libregister \ + +LOCAL_SHARED_LIBRARIES := libc_sec \ + libslog \ + +LOCAL_SRC_FILES := ops_kernel_store/host_cpu_ops_kernel_builder.cc + +LOCAL_C_INCLUDES := $(local_lib_inc_path) + +include ${BUILD_HOST_STATIC_LIBRARY} + +#compiler for device static lib +include $(CLEAR_VARS) +LOCAL_MODULE := libhost_cpu_opskernel_builder +LOCAL_CFLAGS += -Werror +LOCAL_CFLAGS += -std=c++11 +LOCAL_LDFLAGS := + +LOCAL_STATIC_LIBRARIES := libprotobuf \ + libgraph \ + libregister \ + 
+LOCAL_SHARED_LIBRARIES := libc_sec \ + libslog \ + +LOCAL_SRC_FILES := ops_kernel_store/host_cpu_ops_kernel_builder.cc + +LOCAL_C_INCLUDES := $(local_lib_inc_path) + +include ${BUILD_STATIC_LIBRARY} + +#compiler for atc ops kernel builder +include $(CLEAR_VARS) +LOCAL_MODULE := atclib/libhost_cpu_opskernel_builder +LOCAL_CFLAGS += -Werror +LOCAL_CFLAGS += -std=c++11 +LOCAL_LDFLAGS := + +LOCAL_STATIC_LIBRARIES := +LOCAL_SHARED_LIBRARIES := libprotobuf \ + libc_sec \ + libslog \ + libgraph \ + libregister \ + +LOCAL_SRC_FILES := ops_kernel_store/host_cpu_ops_kernel_builder.cc + +LOCAL_C_INCLUDES := $(local_lib_inc_path) + +include ${BUILD_HOST_SHARED_LIBRARY} diff --git a/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.cc b/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.cc new file mode 100644 index 00000000..8ab889f0 --- /dev/null +++ b/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.cc @@ -0,0 +1,98 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "host_cpu_ops_kernel_builder.h" +#include +#include "common/ge_inner_error_codes.h" +#include "ge/ge_api_types.h" +#include "graph/utils/node_utils.h" +#include "graph/utils/tensor_utils.h" +#include "graph/utils/type_utils.h" +#include "framework/common/debug/ge_log.h" +#include "host_cpu_engine/common/constant/constant.h" +#include "register/ops_kernel_builder_registry.h" + +namespace ge { +namespace host_cpu { +REGISTER_OPS_KERNEL_BUILDER(kHostCpuOpKernelLibName, HostCpuOpsKernelBuilder); + +Status HostCpuOpsKernelBuilder::Finalize() { return SUCCESS; } +Status HostCpuOpsKernelBuilder::Initialize(const map &options) { return SUCCESS; } + +Status HostCpuOpsKernelBuilder::CalcOpRunningParam(Node &ge_node) { + OpDescPtr op_desc = ge_node.GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(FAILED, "CalcOpRunningParam failed, as op desc is null"); + return FAILED; + } + + bool is_shape_unknown = false; + if (NodeUtils::GetNodeUnknownShapeStatus(ge_node, is_shape_unknown) == GRAPH_SUCCESS) { + if (is_shape_unknown) { + GELOGI("op:%s is unknown shape, does not need to calc output size.", ge_node.GetName().c_str()); + return SUCCESS; + } + } + + const string name = ge_node.GetName(); + const string type = ge_node.GetType(); + GELOGD("Calc op[%s:%s] running param, output size=%zu.", name.c_str(), type.c_str(), op_desc->GetOutputsSize()); + + for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) { + GeTensorDesc output_tensor = op_desc->GetOutputDesc(static_cast(i)); + Format format = output_tensor.GetFormat(); + DataType data_type = output_tensor.GetDataType(); + + int64_t mem_size = 0; + // If mem size has been set, no need reset. 
+ if ((TensorUtils::GetSize(output_tensor, mem_size) == GRAPH_SUCCESS) && (mem_size > 0)) { + GELOGD("Op[%s:%s] out[%zu] mem size has been set, no need calc again, format=%s, data_type=%s, mem_size=%ld.", + name.c_str(), type.c_str(), i, TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str(), mem_size); + continue; + } + + int64_t output_mem_size = 0; + GeShape output_shape = output_tensor.GetShape(); + if ((TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size) != GRAPH_SUCCESS) || + (output_mem_size < 0)) { + GELOGE(FAILED, "Calc op[%s:%s] out[%zu] mem size failed, mem_size=%ld, format=%s, data_type=%s.", name.c_str(), + type.c_str(), i, output_mem_size, TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); + return FAILED; + } + GELOGI("Calc op[%s:%s] out[%zu] mem size is %ld, format=%s, data_type=%s.", name.c_str(), type.c_str(), i, + output_mem_size, TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); + + TensorUtils::SetSize(output_tensor, output_mem_size); + if (op_desc->UpdateOutputDesc(static_cast(i), output_tensor) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Update op[%s:%s] out[%zu] desc failed, format=%s, data_type=%s.", name.c_str(), type.c_str(), i, + TypeUtils::FormatToSerialString(format).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str()); + return FAILED; + } + } + + GELOGD("Calc op[%s:%s] running param success.", name.c_str(), type.c_str()); + return SUCCESS; +} + +Status HostCpuOpsKernelBuilder::GenerateTask(const Node &node, RunContext &context, vector &tasks) { + // no need to generate device task + return SUCCESS; +} +} // namespace host_cpu +} // namespace ge \ No newline at end of file diff --git a/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.h b/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.h new file mode 100644 
index 00000000..c1c78a19 --- /dev/null +++ b/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.h @@ -0,0 +1,37 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_HOST_CPU_ENGINE_OPS_KERNEL_STORE_HOST_CPU_OPS_KERNEL_BUILDER_H_ +#define GE_HOST_CPU_ENGINE_OPS_KERNEL_STORE_HOST_CPU_OPS_KERNEL_BUILDER_H_ + +#include "common/opskernel/ops_kernel_builder.h" + +namespace ge { +namespace host_cpu { +class HostCpuOpsKernelBuilder : public OpsKernelBuilder { + public: + Status Initialize(const map &options) override; + + Status Finalize() override; + + Status CalcOpRunningParam(Node &node) override; + + Status GenerateTask(const Node &node, RunContext &context, std::vector &tasks) override; +}; +} // namespace host_cpu +} // namespace ge + +#endif // GE_HOST_CPU_ENGINE_OPS_KERNEL_STORE_HOST_CPU_OPS_KERNEL_BUILDER_H_ diff --git a/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_info.cc b/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_info.cc index 4e7be2d5..dfdcf432 100644 --- a/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_info.cc +++ b/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_info.cc @@ -18,14 +18,11 @@ #include #include "common/constant/constant.h" #include "ge/ge_api_types.h" -#include "common/ge/ge_util.h" -#include "common/ge_inner_error_codes.h" #include "framework/common/debug/ge_log.h" #include 
"graph/utils/node_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" #include "op/op_factory.h" -#include "proto/task.pb.h" namespace ge { namespace host_cpu { @@ -58,71 +55,8 @@ Status HostCpuOpsKernelInfoStore::Finalize() { return SUCCESS; } -Status HostCpuOpsKernelInfoStore::CalcOpRunningParam(Node &ge_node) { - OpDescPtr op_desc = ge_node.GetOpDesc(); - if (op_desc == nullptr) { - GELOGE(FAILED, "CalcOpRunningParam failed, as op desc is null"); - return FAILED; - } - - bool is_shape_unknown = false; - if (NodeUtils::GetNodeUnknownShapeStatus(ge_node, is_shape_unknown) == GRAPH_SUCCESS) { - if (is_shape_unknown) { - GELOGI("op:%s is unknown shape, does not need to calc output size.", ge_node.GetName().c_str()); - return SUCCESS; - } - } - - const string name = ge_node.GetName(); - const string type = ge_node.GetType(); - GELOGD("Calc op[%s:%s] running param, output size=%zu.", name.c_str(), type.c_str(), op_desc->GetOutputsSize()); - - for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) { - GeTensorDesc output_tensor = op_desc->GetOutputDesc(static_cast(i)); - Format format = output_tensor.GetFormat(); - DataType data_type = output_tensor.GetDataType(); - - int64_t mem_size = 0; - // If mem size has been set, no need reset. 
- if ((TensorUtils::GetSize(output_tensor, mem_size) == GRAPH_SUCCESS) && (mem_size > 0)) { - GELOGD("Op[%s:%s] out[%zu] mem size has been set, no need calc again, format=%s, data_type=%s, mem_size=%ld.", - name.c_str(), type.c_str(), i, TypeUtils::FormatToSerialString(format).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str(), mem_size); - continue; - } - - int64_t output_mem_size = 0; - GeShape output_shape = output_tensor.GetShape(); - if ((TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size) != GRAPH_SUCCESS) || - (output_mem_size < 0)) { - GELOGE(FAILED, "Calc op[%s:%s] out[%zu] mem size failed, mem_size=%ld, format=%s, data_type=%s.", name.c_str(), - type.c_str(), i, output_mem_size, TypeUtils::FormatToSerialString(format).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str()); - return FAILED; - } - GELOGI("Calc op[%s:%s] out[%zu] mem size is %ld, format=%s, data_type=%s.", name.c_str(), type.c_str(), i, - output_mem_size, TypeUtils::FormatToSerialString(format).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str()); - - TensorUtils::SetSize(output_tensor, output_mem_size); - if (op_desc->UpdateOutputDesc(static_cast(i), output_tensor) != GRAPH_SUCCESS) { - GELOGE(FAILED, "Update op[%s:%s] out[%zu] desc failed, format=%s, data_type=%s.", name.c_str(), type.c_str(), i, - TypeUtils::FormatToSerialString(format).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str()); - return FAILED; - } - } - - GELOGD("Calc op[%s:%s] running param success.", name.c_str(), type.c_str()); - return SUCCESS; -} - void HostCpuOpsKernelInfoStore::GetAllOpsKernelInfo(map &infos) const { infos = op_info_map_; } -Status HostCpuOpsKernelInfoStore::GenerateTask(const Node &node, RunContext &context, vector &tasks) { - // no need to generate device task - return SUCCESS; -} - bool HostCpuOpsKernelInfoStore::CheckSupported(const OpDescPtr &op_desc, std::string &) const { if (op_desc == nullptr) { return false; diff --git 
a/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_info.h b/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_info.h index 1202cc8a..f477af52 100644 --- a/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_info.h +++ b/src/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_info.h @@ -57,22 +57,6 @@ class HostCpuOpsKernelInfoStore : public OpsKernelInfoStore { */ void GetAllOpsKernelInfo(std::map &infos) const override; - /** - * Calc the running size of Operator, - * then GE will alloc the mem size from runtime - * @param ge_node Node information - * @return status whether this operation success - */ - Status CalcOpRunningParam(ge::Node &ge_node) override; - - /** - * call the runtime's interface to generate the task - * @param node Node information - * @param context run context info - * @return status whether this operation success - */ - Status GenerateTask(const ge::Node &ge_node, ge::RunContext &context, std::vector &tasks) override; - HostCpuOpsKernelInfoStore(const HostCpuOpsKernelInfoStore &ops_kernel_store) = delete; HostCpuOpsKernelInfoStore(const HostCpuOpsKernelInfoStore &&ops_kernel_store) = delete; HostCpuOpsKernelInfoStore &operator=(const HostCpuOpsKernelInfoStore &ops_kernel_store) = delete; diff --git a/src/ge/host_cpu_engine/proto/task.proto b/src/ge/host_cpu_engine/proto/task.proto deleted file mode 120000 index 36ae4847..00000000 --- a/src/ge/host_cpu_engine/proto/task.proto +++ /dev/null @@ -1 +0,0 @@ -../../proto/task.proto \ No newline at end of file diff --git a/src/ge/host_cpu_engine/proto/task.proto b/src/ge/host_cpu_engine/proto/task.proto new file mode 100644 index 00000000..50ea061b --- /dev/null +++ b/src/ge/host_cpu_engine/proto/task.proto @@ -0,0 +1,170 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto3"; + +package domi; + +message ModelTaskDef { + string version = 1; + + map attr = 9; // Extended field + repeated TaskDef task = 10; + + uint64 memory_size = 11; + uint32 stream_num = 12; + uint32 event_num = 13; + uint64 weight_size = 14; + + repeated bytes op = 15; // input/output opdef in bytes + + uint64 base_addr = 16; // base addr + uint64 weight_addr = 17; // weight addr + uint32 batch_num = 18; +} + + +message TaskDef { + uint32 id = 1; + uint32 type = 2; + + uint32 stream_id = 10; + uint32 event_id = 11; + + KernelDef kernel = 20; + KernelExDef kernel_ex = 21; + KernelHcclDef kernel_hccl = 25; + EventExDef event_ex = 26; + LogTimeStampDef log_timestamp = 28; + + uint32 label_id = 30; + + MemcpyAsyncDef memcpy_async = 31; + StreamSwitchDef stream_switch = 32; + StreamActiveDef stream_active = 33; + bytes private_def = 34; + uint64 ops_kernel_store_ptr = 35; // adjustments to other fields in the future + StreamSwitchNDef stream_switch_n = 36; + + LabelSetDef label_set = 37; + LabelGotoExDef label_goto_ex = 38; + LabelSwitchByIndexDef label_switch_by_index = 39; +} + +message KernelDef { + KernelContext context = 1; + + string stub_func = 10; + uint32 block_dim = 11; + uint32 args_size = 12; + bytes args = 13; + bytes sm_desc = 14; + bytes flowtable = 15; + string so_name = 16; + string kernel_name = 17; + bytes kernel_ext_info = 18; + uint32 kernel_ext_info_size = 19; +} + +message KernelContext { + uint32 kernel_type = 1; + uint32 op_id = 2; // OP type in CCE + uint32 kernel_func_id = 3; + uint32 op_index = 4; // 
TE/Custom operator + bool is_flowtable = 5; // Identify whether args is a flowtable structure + bytes args_offset = 6; // args offset information + uint32 args_count = 7; // args count + repeated uint32 origin_op_index = 8; +} + + +message KernelExDef { + uint32 flags = 1; + + uint32 op_index = 4; + uint32 args_size = 12; + bytes args = 13; + bytes task_info = 14; // serialized nodeDef, funcDef, inputoutput + uint32 task_info_size = 15; + bytes kernel_ext_info = 16; + uint32 kernel_ext_info_size = 17; +} + + +message KernelHcclDef { + uint32 op_index = 8; + string hccl_type = 9; +} + + +message EventExDef { + uint32 op_index = 1; + uint32 event_type = 2; +} + +message LogTimeStampDef { + uint64 logid = 1; + bool notify = 2; + uint32 flat = 3; +} + +message MemcpyAsyncDef { + uint64 dst = 1; + uint64 dst_max = 2; + uint64 src = 3; + uint64 count = 4; + uint32 kind = 5; + uint32 op_index = 6; +} + +message StreamSwitchDef { + uint32 op_index = 1; + uint32 true_stream_id = 2; + int64 value = 3; + uint64 value_ptr = 4; + uint32 data_type = 5; +} + +message StreamActiveDef { + uint32 op_index = 1; + uint32 active_stream_id = 2; +} + +message StreamSwitchNDef { + uint32 op_index = 1; + uint32 size = 2; + repeated int64 target_value = 3; + repeated uint32 true_stream_id = 4; + uint32 element_size = 5; + uint32 data_type = 6; +} + +message LabelSetDef { + uint32 op_index = 1; + uint32 label_id = 2; + uint32 model_id = 3; +} + +message LabelGotoExDef { + uint32 op_index = 1; + uint32 label_id = 2; + uint32 model_id = 3; +} + +message LabelSwitchByIndexDef { + uint32 op_index = 1; + uint32 label_max = 2; +} diff --git a/src/ge/host_kernels/concat_v2_kernel.cc b/src/ge/host_kernels/concat_v2_kernel.cc index c46b4277..a77fdc42 100644 --- a/src/ge/host_kernels/concat_v2_kernel.cc +++ b/src/ge/host_kernels/concat_v2_kernel.cc @@ -21,12 +21,12 @@ #include "common/debug/log.h" #include "common/fp16_t.h" -#include "common/ge_inner_error_codes.h" #include "common/op/ge_op_utils.h" 
#include "framework/common/debug/ge_log.h" #include "host_kernels/kernel_utils.h" #include "graph/utils/type_utils.h" #include "inc/kernel_factory.h" +#include "framework/common/types.h" namespace ge { namespace { diff --git a/src/ge/host_kernels/fill_kernel.cc b/src/ge/host_kernels/fill_kernel.cc index 27bcb9aa..86aec04b 100644 --- a/src/ge/host_kernels/fill_kernel.cc +++ b/src/ge/host_kernels/fill_kernel.cc @@ -26,6 +26,7 @@ #include "host_kernels/kernel_utils.h" #include "graph/utils/type_utils.h" #include "inc/kernel_factory.h" +#include "framework/common/types.h" namespace { const int kFillInputSize = 2; diff --git a/src/ge/host_kernels/identity_kernel.cc b/src/ge/host_kernels/identity_kernel.cc index 16bd3138..46063ba7 100644 --- a/src/ge/host_kernels/identity_kernel.cc +++ b/src/ge/host_kernels/identity_kernel.cc @@ -16,6 +16,7 @@ #include "identity_kernel.h" #include "inc/kernel_factory.h" +#include "framework/common/types.h" namespace { constexpr uint32_t kInputDescIndex = 0; diff --git a/src/ge/host_kernels/pack_kernel.cc b/src/ge/host_kernels/pack_kernel.cc index 9b62a582..5999be3c 100644 --- a/src/ge/host_kernels/pack_kernel.cc +++ b/src/ge/host_kernels/pack_kernel.cc @@ -28,6 +28,7 @@ #include "host_kernels/kernel_utils.h" #include "graph/utils/type_utils.h" #include "inc/kernel_factory.h" +#include "framework/common/types.h" namespace { const int64_t kShapeItemNumMAX = 2000000000; diff --git a/src/ge/host_kernels/rank_kernel.cc b/src/ge/host_kernels/rank_kernel.cc index 7fb92039..1d93418c 100644 --- a/src/ge/host_kernels/rank_kernel.cc +++ b/src/ge/host_kernels/rank_kernel.cc @@ -25,6 +25,7 @@ #include "framework/common/debug/ge_log.h" #include "inc/kernel_factory.h" #include "omg/omg_inner_types.h" +#include "framework/common/types.h" namespace { const size_t kRankInputSize = 1; diff --git a/src/ge/host_kernels/rsqrt_kernel.cc b/src/ge/host_kernels/rsqrt_kernel.cc index 5184d885..6e9dd6bd 100644 --- a/src/ge/host_kernels/rsqrt_kernel.cc +++ 
b/src/ge/host_kernels/rsqrt_kernel.cc @@ -28,6 +28,7 @@ #include "host_kernels/kernel_utils.h" #include "inc/kernel_factory.h" #include "common/math/math_util.h" +#include "framework/common/types.h" namespace ge { namespace { diff --git a/src/ge/host_kernels/shape_kernel.cc b/src/ge/host_kernels/shape_kernel.cc index 2f20fb24..d4069fb0 100644 --- a/src/ge/host_kernels/shape_kernel.cc +++ b/src/ge/host_kernels/shape_kernel.cc @@ -23,6 +23,7 @@ #include "host_kernels/kernel_utils.h" #include "graph/passes/pass_utils.h" #include "inc/kernel_factory.h" +#include "framework/common/types.h" namespace ge { namespace { diff --git a/src/ge/host_kernels/shape_n_kernel.cc b/src/ge/host_kernels/shape_n_kernel.cc index 33b878cf..ec43b978 100644 --- a/src/ge/host_kernels/shape_n_kernel.cc +++ b/src/ge/host_kernels/shape_n_kernel.cc @@ -23,6 +23,7 @@ #include "host_kernels/kernel_utils.h" #include "graph/passes/pass_utils.h" #include "inc/kernel_factory.h" +#include "framework/common/types.h" namespace ge { Status ShapeNKernel::Compute(const NodePtr &node, std::vector &v_output) { diff --git a/src/ge/host_kernels/strided_slice_kernel.cc b/src/ge/host_kernels/strided_slice_kernel.cc index 13c61666..c6684c55 100644 --- a/src/ge/host_kernels/strided_slice_kernel.cc +++ b/src/ge/host_kernels/strided_slice_kernel.cc @@ -15,17 +15,12 @@ */ #include "host_kernels/strided_slice_kernel.h" - #include "common/fp16_t.h" -#include "common/ge_inner_error_codes.h" #include "common/math/math_util.h" -#include "common/op/ge_op_utils.h" -#include "external/graph/types.h" -#include "framework/common/debug/ge_log.h" +#include "framework/common/types.h" #include "graph/utils/type_utils.h" #include "host_kernels/kernel_utils.h" #include "inc/kernel_factory.h" -#include namespace ge { namespace { @@ -36,16 +31,16 @@ const size_t kStridedSliceBeginIndex = 1; const size_t kStridedSliceEndIndex = 2; const size_t kStridedSliceStrideIndex = 3; const int32_t kDefaultStrideSize = 1; +const uint32_t 
kMaskBitLeftUnit = 1; const std::set kIndexNumberType = {DT_INT32, DT_INT64}; -bool IsEllipsisMaskValid(const GeTensorDescPtr &input_desc, const int ellipsis_mask) { +bool IsEllipsisMaskValid(const GeTensorDescPtr &input_desc, const uint32_t ellipsis_mask) { if (ellipsis_mask != 0) { auto ellipsis_num = 0; auto input_shape = input_desc->GetShape(); - bool ellipsis_mask_flag = false; - for (size_t i = 0; i < input_shape.GetDimNum(); i++) { - uint32_t i_temp = static_cast(i); - ellipsis_mask_flag = (static_cast(ellipsis_mask) & (1 << i_temp)); + for (size_t i = 0; i < input_shape.GetDimNum(); ++i) { + auto i_temp = static_cast(i); + bool ellipsis_mask_flag = (ellipsis_mask) & (kMaskBitLeftUnit << i_temp); if (ellipsis_mask_flag) { ++ellipsis_num; } @@ -57,6 +52,35 @@ bool IsEllipsisMaskValid(const GeTensorDescPtr &input_desc, const int ellipsis_m } return true; } + +void GetOriginStrideVec(const std::vector &input, vector &orig_begin_vec, + vector &orig_end_vec, vector &orig_stride_vec) { + ConstGeTensorPtr begin_tensor = input[kStridedSliceBeginIndex]; + ConstGeTensorPtr end_tensor = input[kStridedSliceEndIndex]; + ConstGeTensorPtr stride_tensor = input[kStridedSliceStrideIndex]; + + auto data_type = begin_tensor->GetTensorDesc().GetDataType(); + size_t vec_size = begin_tensor->GetData().size() / GetSizeByDataType(data_type); + if (data_type == DT_INT32) { + const int32_t *begin = reinterpret_cast(begin_tensor->GetData().data()); + const int32_t *end = reinterpret_cast(end_tensor->GetData().data()); + const int32_t *stride = reinterpret_cast(stride_tensor->GetData().data()); + for (size_t i = 0; i < vec_size; ++i) { + orig_begin_vec.emplace_back(begin[i]); + orig_end_vec.emplace_back(end[i]); + orig_stride_vec.emplace_back(stride[i]); + } + } else { + const int64_t *begin = reinterpret_cast(begin_tensor->GetData().data()); + const int64_t *end = reinterpret_cast(end_tensor->GetData().data()); + const int64_t *stride = 
reinterpret_cast(stride_tensor->GetData().data()); + for (size_t i = 0; i < vec_size; ++i) { + orig_begin_vec.emplace_back(begin[i]); + orig_end_vec.emplace_back(end[i]); + orig_stride_vec.emplace_back(stride[i]); + } + } +} } // namespace Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector &input, vector &v_output) { @@ -133,7 +157,7 @@ Status StridedSliceKernel::CheckAndGetAttr(const OpDescPtr &attr) { } return SUCCESS; } -Status StridedSliceKernel::CheckInputParam(const std::vector &input) const { +Status StridedSliceKernel::CheckInputParam(const std::vector &input) { if (input.size() != kStridedSliceInputSize) { GELOGE(PARAM_INVALID, "The number of input for strided slice must be %zu.", kStridedSliceInputSize); return PARAM_INVALID; @@ -170,9 +194,9 @@ Status StridedSliceKernel::CheckInputParam(const std::vector & return PARAM_INVALID; } size_t weight0_size = weight0->GetData().size() / x_data_size; - size_t begin_data_size = begin_tensor->GetData().size() / sizeof(int32_t); - size_t end_data_size = end_tensor->GetData().size() / sizeof(int32_t); - size_t stride_data_size = stride_tensor->GetData().size() / sizeof(int32_t); + size_t begin_data_size = begin_tensor->GetData().size(); + size_t end_data_size = end_tensor->GetData().size(); + size_t stride_data_size = stride_tensor->GetData().size(); if ((weight0_size == 0) || (begin_data_size == 0) || (end_data_size == 0) || (stride_data_size == 0)) { GELOGW("Data size of inputs is 0."); return PARAM_INVALID; @@ -182,7 +206,6 @@ Status StridedSliceKernel::CheckInputParam(const std::vector & GELOGW("The sizes of begin, end and stride is not supported."); return PARAM_INVALID; } - return SUCCESS; } @@ -191,8 +214,6 @@ Status StridedSliceKernel::InitParamWithAttrs(const std::vector &output_dims, std::vector &stride_vec) { ConstGeTensorPtr weight0 = input[kStridedSliceInputIndex]; ConstGeTensorPtr begin_tensor = input[kStridedSliceBeginIndex]; - ConstGeTensorPtr end_tensor = 
input[kStridedSliceEndIndex]; - ConstGeTensorPtr stride_tensor = input[kStridedSliceStrideIndex]; const GeShape x_shape = weight0->GetTensorDesc().GetShape(); auto x_dims = x_shape.GetDims(); @@ -200,15 +221,13 @@ Status StridedSliceKernel::InitParamWithAttrs(const std::vector(begin_tensor->GetData().data()); - const int32_t *end = reinterpret_cast(end_tensor->GetData().data()); - const int32_t *stride = reinterpret_cast(stride_tensor->GetData().data()); - auto begin_dim_num = begin_tensor->GetData().size() / sizeof(int32_t); + vector orig_begin_vec, orig_end_vec, orig_stride_vec; + GetOriginStrideVec(input, orig_begin_vec, orig_end_vec, orig_stride_vec); + auto begin_dim_num = orig_begin_vec.size(); auto min_dim = x_dims_num > begin_dim_num ? begin_dim_num : x_dims_num; for (size_t i = 0; i < x_dims.size(); ++i) { - auto i_temp = static_cast(i); - bool new_axis_mask_flag = - (static_cast(attr_value_map_.at(STRIDE_SLICE_ATTR_NEW_AXIS_MASK)) & (1 << i_temp)); + auto i_temp = static_cast(i); + bool new_axis_mask_flag = (attr_value_map_.at(STRIDE_SLICE_ATTR_NEW_AXIS_MASK) & (kMaskBitLeftUnit << i_temp)); if (new_axis_mask_flag) { output_dims.push_back(1); input_dims.push_back(1); @@ -221,9 +240,9 @@ Status StridedSliceKernel::InitParamWithAttrs(const std::vector &x_dims) { auto begin_data_type_size = GetSizeByDataType(begin_tensor->GetTensorDesc().GetDataType()); size_t begin_vec_size = begin_tensor->GetData().size() / begin_data_type_size; auto final_dim_num = x_dims_num < begin_vec_size ? 
begin_vec_size : x_dims_num; for (size_t i = 0; i < final_dim_num; i++) { - auto i_temp = static_cast(i); - bool new_axis_mask_flag = - (static_cast(attr_value_map_.at(STRIDE_SLICE_ATTR_NEW_AXIS_MASK)) & (1 << i_temp)); + auto i_temp = static_cast(i); + bool new_axis_mask_flag = (attr_value_map_.at(STRIDE_SLICE_ATTR_NEW_AXIS_MASK) & (kMaskBitLeftUnit << i_temp)); if (new_axis_mask_flag) { x_dims.insert(x_dims.begin() + i, 1); } } } + Status StridedSliceKernel::MaskCal(const size_t i, int64_t &begin_i, int64_t &end_i, int64_t &dim_i) const { - uint64_t i_temp = static_cast(i); - bool begin_mask_flag = (static_cast(attr_value_map_.at(STRIDE_SLICE_ATTR_BEGIN_MASK)) & (1 << i_temp)); - bool end_mask_flag = (static_cast(attr_value_map_.at(STRIDE_SLICE_ATTR_END_MASK)) & (1 << i_temp)); - bool ellipsis_mask_flag = - (static_cast(attr_value_map_.at(STRIDE_SLICE_ATTR_ELLIPSIS_MASK)) & (1 << i_temp)); - bool shrink_mask_flag = - (static_cast(attr_value_map_.at(STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK)) & (1 << i_temp)); + auto i_temp = static_cast(i); + bool begin_mask_flag = (attr_value_map_.at(STRIDE_SLICE_ATTR_BEGIN_MASK) & (kMaskBitLeftUnit << i_temp)); + bool end_mask_flag = (attr_value_map_.at(STRIDE_SLICE_ATTR_END_MASK) & (kMaskBitLeftUnit << i_temp)); + bool ellipsis_mask_flag = (attr_value_map_.at(STRIDE_SLICE_ATTR_ELLIPSIS_MASK) & (kMaskBitLeftUnit << i_temp)); + bool shrink_mask_flag = (attr_value_map_.at(STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK) & (kMaskBitLeftUnit << i_temp)); if (shrink_mask_flag) { begin_i = (begin_i < 0 ? 
(dim_i + begin_i) : begin_i); FMK_INT32_ADDCHECK(begin_i, kNumOne) @@ -291,8 +309,9 @@ Status StridedSliceKernel::MaskCal(const size_t i, int64_t &begin_i, int64_t &en } return SUCCESS; } + Status StridedSliceKernel::StrideCal(const int64_t x_dims_i, int64_t &begin_i, int64_t &end_i, int64_t &stride_i, - int64_t &dim_final) const { + int64_t &dim_final) { if (stride_i == 0) { stride_i = kDefaultStrideSize; } else if (stride_i < 0) { @@ -312,15 +331,17 @@ Status StridedSliceKernel::StrideCal(const int64_t x_dims_i, int64_t &begin_i, i } return SUCCESS; } + void StridedSliceKernel::GetOutputDims(uint32_t dims_size, const std::vector &output_dims, vector &v_dims) { for (uint32_t k = 0; k < dims_size; k++) { - bool shrink_mask_i = (static_cast(attr_value_map_.at(STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK)) & (1 << k)); + bool shrink_mask_i = (attr_value_map_.at(STRIDE_SLICE_ATTR_SHRINK_AXIS_MASK) & (kMaskBitLeftUnit << k)); if (shrink_mask_i) { continue; } v_dims.push_back(output_dims[k]); } } + REGISTER_KERNEL(STRIDEDSLICE, StridedSliceKernel); } // namespace ge diff --git a/src/ge/host_kernels/strided_slice_kernel.h b/src/ge/host_kernels/strided_slice_kernel.h index 5d130cd7..65e572bf 100644 --- a/src/ge/host_kernels/strided_slice_kernel.h +++ b/src/ge/host_kernels/strided_slice_kernel.h @@ -28,13 +28,13 @@ class StridedSliceKernel : public Kernel { private: Status CheckAndGetAttr(const OpDescPtr &attr); - Status CheckInputParam(const std::vector &input) const; + static Status CheckInputParam(const std::vector &input); Status InitParamWithAttrs(const std::vector &input, std::vector &input_dims, std::vector &begin_vec, std::vector &output_dims, std::vector &stride_vec); Status MaskCal(const size_t i, int64_t &begin_i, int64_t &end_i, int64_t &dim_i) const; - Status StrideCal(const int64_t x_dims_i, int64_t &begin_i, int64_t &end_i, int64_t &stride_i, - int64_t &dim_final) const; + static Status StrideCal(const int64_t x_dims_i, int64_t &begin_i, int64_t &end_i, int64_t 
&stride_i, + int64_t &dim_final); void ExpandDimsWithNewAxis(const ConstGeTensorPtr &begin_tensor, const size_t x_dims_num, vector &x_dims); void GetOutputDims(uint32_t dims_size, const std::vector &output_dims, vector &v_dims); diff --git a/src/ge/hybrid/common/npu_memory_allocator.cc b/src/ge/hybrid/common/npu_memory_allocator.cc index cbb556e2..da365bf7 100644 --- a/src/ge/hybrid/common/npu_memory_allocator.cc +++ b/src/ge/hybrid/common/npu_memory_allocator.cc @@ -117,4 +117,4 @@ void NpuMemoryAllocator::DestroyAllocator() { allocators_.erase(device_id); } } // namespace hybrid -} // namespace ge \ No newline at end of file +} // namespace ge diff --git a/src/ge/hybrid/executor/hybrid_model_async_executor.cc b/src/ge/hybrid/executor/hybrid_model_async_executor.cc index 7f650017..5b08ed2f 100644 --- a/src/ge/hybrid/executor/hybrid_model_async_executor.cc +++ b/src/ge/hybrid/executor/hybrid_model_async_executor.cc @@ -18,6 +18,7 @@ #include "graph/load/new_model_manager/model_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" +#include "graph/ge_context.h" #include "omm/csa_interact.h" namespace ge { @@ -51,7 +52,10 @@ Status HybridModelAsyncExecutor::Start(const std::shared_ptr &lis run_flag_ = true; listener_ = listener; - future_ = std::async([&]() -> Status { return RunInternal(); }); + future_ = std::async([&]() -> Status { + GetContext().SetSessionId(executor_->GetContext()->session_id); + return RunInternal(); + }); GE_CHK_BOOL_RET_STATUS(future_.valid(), INTERNAL_ERROR, "Failed to start."); GELOGD("HybridModelExecutor::Start successfully"); diff --git a/src/ge/hybrid/executor/hybrid_model_executor.cc b/src/ge/hybrid/executor/hybrid_model_executor.cc index 718801b4..d8e06b6c 100644 --- a/src/ge/hybrid/executor/hybrid_model_executor.cc +++ b/src/ge/hybrid/executor/hybrid_model_executor.cc @@ -102,7 +102,7 @@ Status HybridModelExecutor::InitExecutionContext() { const char *profiling_level = std::getenv(kEnvProfilingLevel); if 
(profiling_level != nullptr) { context_.profiling_level = std::strtol(profiling_level, nullptr, kIntBase); - GELOGD("Got profiling level = %d", context_.profiling_level); + GELOGD("Got profiling level = %ld", context_.profiling_level); if (context_.profiling_level > 0) { context_.profiler.reset(new (std::nothrow) HybridProfiler()); GE_CHECK_NOTNULL(context_.profiler); diff --git a/src/ge/hybrid/executor/rt_callback_manager.cc b/src/ge/hybrid/executor/rt_callback_manager.cc index c1c98f73..03681f13 100644 --- a/src/ge/hybrid/executor/rt_callback_manager.cc +++ b/src/ge/hybrid/executor/rt_callback_manager.cc @@ -27,10 +27,17 @@ Status CallbackManager::RegisterCallback(rtCallback_t callback, void *user_data) GELOGD("To register callback"); rtEvent_t event = nullptr; GE_CHK_RT_RET(rtEventCreate(&event)); - GE_CHK_RT_RET(rtEventRecord(event, stream_)); + auto rt_ret = rtEventRecord(event, stream_); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Failed to invoke rtEventRecord, error code = %d", rt_ret); + (void)rtEventDestroy(event); + return RT_FAILED; + } + auto cb = std::pair(callback, user_data); auto entry = std::pair>(event, std::move(cb)); if (!callback_queue_.Push(entry)) { + (void)rtEventDestroy(event); return INTERNAL_ERROR; } diff --git a/src/ge/hybrid/executor/subgraph_executor.cc b/src/ge/hybrid/executor/subgraph_executor.cc index c76bb209..1a8de55f 100644 --- a/src/ge/hybrid/executor/subgraph_executor.cc +++ b/src/ge/hybrid/executor/subgraph_executor.cc @@ -15,6 +15,7 @@ */ #include "hybrid/executor/subgraph_executor.h" +#include "graph/ge_context.h" #include "hybrid/executor/worker/task_compile_engine.h" #include "hybrid/executor/worker/execution_engine.h" #include "hybrid/node_executor/node_executor.h" @@ -200,6 +201,7 @@ Status SubgraphExecutor::PrepareNodes() { // only do shape inference and compilation for nodes with dynamic shapes. 
if (node_item.is_dynamic) { auto prepare_future = pre_run_pool_.commit([this, p_node_state]() -> Status { + GetContext().SetSessionId(context_->session_id); GE_CHK_STATUS_RET_NOLOG(InferShape(shape_inference_engine_.get(), *p_node_state)); return PrepareForExecution(context_, *p_node_state); }); @@ -285,6 +287,7 @@ Status SubgraphExecutor::LaunchTasks() { Status SubgraphExecutor::ScheduleTasks() { GELOGD("[%s] Start to schedule prepare workers.", graph_item_->GetName().c_str()); auto prepare_future = std::async([&]() -> Status { + GetContext().SetSessionId(context_->session_id); auto ret = PrepareNodes(); ready_queue_.Push(nullptr); return ret; diff --git a/src/ge/hybrid/executor/worker/execution_engine.cc b/src/ge/hybrid/executor/worker/execution_engine.cc index 1eb73e41..6ccf311f 100644 --- a/src/ge/hybrid/executor/worker/execution_engine.cc +++ b/src/ge/hybrid/executor/worker/execution_engine.cc @@ -18,10 +18,14 @@ #include "graph/runtime_inference_context.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/tensor_adapter.h" +#include "graph/debug/ge_attr_define.h" #include "hybrid/node_executor/node_executor.h" #include "common/dump/dump_manager.h" #include "common/dump/dump_op.h" #include "common/types.h" +#include "common/ge_types.h" +#include "common/profiling/profiling_manager.h" +#include "runtime/base.h" namespace ge { namespace hybrid { @@ -63,6 +67,10 @@ class NodeDoneCallback { private: Status PrepareConstInputs(const NodeItem &node_item); Status DumpDynamicNode(); + Status ProfilingReport(); + Status GetGraphDescInfo(const NodePtr node, const HybridModel *model, + std::vector &compute_graph_info); + Status GetTaskDescInfo(const NodePtr node, const HybridModel *model, std::vector &task_desc_info); GraphExecutionContext *graph_context_; std::shared_ptr context_; DumpOp dump_op_; @@ -99,8 +107,7 @@ Status NodeDoneCallback::PrepareConstInputs(const NodeItem &node_item) { output_tensor->GetSize()); GE_CHK_RT_RET( rtMemcpy(host_buffer.data(), 
tensor_size, output_tensor->GetData(), tensor_size, RT_MEMCPY_DEVICE_TO_HOST)); - tensor.SetData(host_buffer); - + tensor.SetData(std::move(host_buffer)); string session_id = std::to_string(context_->GetSessionId()); RuntimeInferenceContext *runtime_infer_ctx = nullptr; GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::GetContext(session_id, &runtime_infer_ctx), @@ -118,6 +125,119 @@ Status NodeDoneCallback::PrepareConstInputs(const NodeItem &node_item) { return SUCCESS; } +Status NodeDoneCallback::GetTaskDescInfo(const NodePtr node, const HybridModel *model, + std::vector &task_desc_info) { + GE_CHECK_NOTNULL(node); + GE_CHECK_NOTNULL(model); + + GELOGD("GetTaskDescInfo of node [%s] start.", node->GetName().c_str()); + auto op_desc = node->GetOpDesc(); + std::string op_name = op_desc->GetName(); + std::string dynamic_model_name = model->GetModelName(); + + uint32_t task_id = 0; + uint32_t stream_id = 0; + if (rtGetTaskIdAndStreamID(&task_id, &stream_id) != RT_ERROR_NONE) { + GELOGE(PARAM_INVALID, "Get task_id and stream_id failed."); + return PARAM_INVALID; + } + + TaskDescInfo tmp_task_desc_info; + tmp_task_desc_info.model_name = dynamic_model_name; + tmp_task_desc_info.op_name = op_name; + tmp_task_desc_info.block_dim = 0; + auto task_defs = model->GetTaskDefs(node); + if (task_defs != nullptr && (*task_defs).size() > 0) { + const auto &task_def = (*task_defs)[0]; + tmp_task_desc_info.block_dim = task_def.kernel().block_dim(); + } + tmp_task_desc_info.task_id = task_id; + tmp_task_desc_info.stream_id = stream_id; + GELOGD("GetTaskDescInfo of node [%s] end, task_id[%u], stream_id[%u]", node->GetName().c_str(), task_id, stream_id); + task_desc_info.emplace_back(tmp_task_desc_info); + return SUCCESS; +} + +Status NodeDoneCallback::GetGraphDescInfo(const NodePtr node, const HybridModel *model, + std::vector &compute_graph_info) { + GE_CHECK_NOTNULL(node); + GE_CHECK_NOTNULL(model); + + GELOGD("GetComputeGraphInfo of node [%s] start.", node->GetName().c_str()); + + 
std::string dynamic_model_name = model->GetModelName(); + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(PARAM_INVALID, "op_desc is nullptr."); + return PARAM_INVALID; + } + + auto op_mode = static_cast(domi::ImplyType::INVALID); + if (AttrUtils::GetInt(op_desc, ATTR_NAME_IMPLY_TYPE, op_mode) && + op_mode == static_cast(domi::ImplyType::TVM)) { + ComputeGraphDescInfo tmp_compute_graph_info; + tmp_compute_graph_info.model_name = dynamic_model_name; + tmp_compute_graph_info.op_name = op_desc->GetName(); + tmp_compute_graph_info.op_type = op_desc->GetType(); + + for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { + GeTensorDescPtr input_desc = op_desc->MutableInputDesc(i); + if (input_desc == nullptr) { + continue; + } + tmp_compute_graph_info.input_format.emplace_back(input_desc->GetFormat()); + tmp_compute_graph_info.input_shape.emplace_back(input_desc->GetShape().GetDims()); + tmp_compute_graph_info.input_data_type.emplace_back(input_desc->GetDataType()); + } + + for (size_t j = 0; j < op_desc->GetOutputsSize(); ++j) { + GeTensorDesc output_desc = op_desc->GetOutputDesc(j); + tmp_compute_graph_info.output_format.emplace_back(output_desc.GetFormat()); + tmp_compute_graph_info.output_shape.emplace_back(output_desc.GetShape().GetDims()); + tmp_compute_graph_info.output_data_type.emplace_back(output_desc.GetDataType()); + } + compute_graph_info.emplace_back(tmp_compute_graph_info); + GELOGD("GetComputeGraphInfo of node [%s] end.", node->GetName().c_str()); + } + return SUCCESS; +} + +Status NodeDoneCallback::ProfilingReport() { + auto node = context_->GetNodeItem().node; + if (node == nullptr) { + GELOGE(PARAM_INVALID, "Get node is nullptr"); + return PARAM_INVALID; + } + + const auto &op_type = node->GetType(); + if (op_type == PARTITIONEDCALL) { + return SUCCESS; + } + + GE_CHECK_NOTNULL(graph_context_); + const HybridModel *model = graph_context_->model; + GE_CHECK_NOTNULL(model); + + GELOGD("ProfilingReport of node [%s] model [%s] 
start.", node->GetName().c_str(), model->GetModelName().c_str()); + std::vector task_desc_info; + TaskDescInfo tmp_task_desc_info; + auto profiling_ret = GetTaskDescInfo(node, model, task_desc_info); + if (profiling_ret != RT_ERROR_NONE) { + GELOGE(profiling_ret, "Get task info of node[%s] failed.", node->GetName().c_str()); + return profiling_ret; + } + + std::vector compute_graph_info; + profiling_ret = GetGraphDescInfo(node, model, compute_graph_info); + if (profiling_ret != RT_ERROR_NONE) { + GELOGE(profiling_ret, "Get graph info of node[%s] failed.", node->GetName().c_str()); + return profiling_ret; + } + + ProfilingManager::Instance().ReportProfilingData(task_desc_info, compute_graph_info); + return SUCCESS; +} + Status NodeDoneCallback::DumpDynamicNode() { auto node = context_->GetNodeItem().node; if (node == nullptr) { @@ -191,6 +311,10 @@ Status NodeDoneCallback::OnNodeDone() { GE_CHK_STATUS_RET(DumpDynamicNode(), "Failed to dump dynamic node"); } + if (ProfilingManager::Instance().ProfilingModelExecuteOn()) { + GE_CHK_STATUS_RET(ProfilingReport(), "Report node[%s] to profiling failed.", node_item.NodeName().c_str()); + } + // release inputs for (int i = 0; i < context_->NumInputs(); ++i) { context_->ReleaseInput(i); @@ -299,6 +423,11 @@ Status ExecutionEngine::ValidateInputTensors(const NodeState &node_state, const continue; } + if (input_tensor->GetData() == nullptr) { + GELOGD("[%s] Skipping null input, index = %d", task_context.GetNodeName(), i); + continue; + } + int64_t expected_size; GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, expected_size)); GELOGD("[%s] Input[%d] expects [%ld] bytes.", task_context.GetNodeName(), i, expected_size); diff --git a/src/ge/hybrid/executor/worker/shape_inference_engine.cc b/src/ge/hybrid/executor/worker/shape_inference_engine.cc index 49a29259..cc83ccfb 100644 --- a/src/ge/hybrid/executor/worker/shape_inference_engine.cc +++ b/src/ge/hybrid/executor/worker/shape_inference_engine.cc @@ 
-29,6 +29,10 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) { GE_CHK_STATUS_RET_NOLOG(node_state.GetShapeInferenceState().AwaitShapesReady(*execution_context_)); auto &node_item = *node_state.GetNodeItem(); + + // Wait for "const input nodes" if node's shape inference function requires any. + // Even if output shape is static, there are cases that the const-input will be used in OpTiling and Execution + GE_CHK_STATUS_RET_NOLOG(AwaitDependentNodes(node_state)); if (node_item.is_output_shape_static) { return SUCCESS; } @@ -51,9 +55,6 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) { } } - // Wait for "const input nodes" if node's shape inference function requires any. - GE_CHK_STATUS_RET_NOLOG(AwaitDependentNodes(node_state)); - // Do shape inference GELOGD("[%s] Start to invoke InferShapeAndType", node_item.NodeName().c_str()); { diff --git a/src/ge/hybrid/model/hybrid_model.cc b/src/ge/hybrid/model/hybrid_model.cc index 18db28cb..4d3b1462 100644 --- a/src/ge/hybrid/model/hybrid_model.cc +++ b/src/ge/hybrid/model/hybrid_model.cc @@ -16,8 +16,8 @@ #include "hybrid_model.h" #include -#include "graph/load/new_model_manager/model_utils.h" #include "graph/debug/ge_attr_define.h" +#include "graph/load/new_model_manager/model_utils.h" #include "graph/utils/graph_utils.h" #include "graph/utils/node_utils.h" #include "graph/utils/tensor_utils.h" @@ -50,13 +50,16 @@ TensorValue *HybridModel::GetVariable(const string &name) const { } NodePtr HybridModel::GetVariableNode(const string &name) const { - auto it = variable_nodes_.find(name); - if (it == variable_nodes_.end()) { - GELOGI("Failed to get variable node by name = [%s]", name.c_str()); - return nullptr; + auto it = device_variable_nodes_.find(name); + if (it != device_variable_nodes_.end()) { + return it->second; } - - return it->second; + auto host_find = host_variable_nodes_.find(name); + if (host_find != host_variable_nodes_.end()) { + return host_find->second; + } + GELOGI("Failed 
to get variable node by name = [%s]", name.c_str()); + return nullptr; } const std::vector *HybridModel::GetTaskDefs(const NodePtr &node) const { diff --git a/src/ge/hybrid/model/hybrid_model.h b/src/ge/hybrid/model/hybrid_model.h index 668b5fd7..6f9ce9f7 100644 --- a/src/ge/hybrid/model/hybrid_model.h +++ b/src/ge/hybrid/model/hybrid_model.h @@ -79,7 +79,8 @@ class HybridModel { GeRootModelPtr ge_root_model_; std::map input_nodes_; std::map constant_op_nodes_; - std::map variable_nodes_; + std::map device_variable_nodes_; + std::map host_variable_nodes_; std::map> variable_tensors_; std::map> task_defs_; std::map known_shape_sub_models_; diff --git a/src/ge/hybrid/model/hybrid_model_builder.cc b/src/ge/hybrid/model/hybrid_model_builder.cc index 0671990c..2c6227fb 100644 --- a/src/ge/hybrid/model/hybrid_model_builder.cc +++ b/src/ge/hybrid/model/hybrid_model_builder.cc @@ -17,10 +17,12 @@ #include "hybrid/model/hybrid_model_builder.h" #include "common/math/math_util.h" #include "graph/ge_context.h" +#include "graph/build/memory/var_mem_assign_util.h" #include "graph/utils/node_utils.h" #include "graph/debug/ge_attr_define.h" #include "graph/load/new_model_manager/model_utils.h" #include "graph/manager/graph_var_manager.h" +#include "graph/manager/host_mem_manager.h" #include "graph/manager/trans_var_data_utils.h" #include "graph/utils/graph_utils.h" #include "graph/utils/type_utils.h" @@ -180,6 +182,8 @@ Status HybridModelBuilder::GetOrCreateNodeItem(const NodePtr &node, NodeItem **n Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const std::vector &dependencies) { std::set dependent_input_nodes; auto &ge_node = node_item.node; + bool is_hccl_op = + NodeExecutorManager::GetInstance().ResolveExecutorType(*ge_node) == NodeExecutorManager::ExecutorType::HCCL; // The input tensors become valid after computation is done for parent nodes of type DEPEND_COMPUTE. // Wait for these parent nodes before execution. 
@@ -194,7 +198,12 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s auto src_node_item = MutableNodeItem(src_node); GE_CHECK_NOTNULL(src_node_item); - if (src_node_item->shape_inference_type == DEPEND_COMPUTE) { + if (is_hccl_op) { + GELOGD("[%s] Add input data dependent node [%s] due to engine type is HCCL", node_item.NodeName().c_str(), + src_node_item->NodeName().c_str()); + src_node_item->has_observer = true; + node_item.dependents_for_execution.emplace_back(src_node); + } else if (src_node_item->shape_inference_type == DEPEND_COMPUTE) { GELOGD("[%s] Add input data dependent node [%s] due to inference type = DEPEND_COMPUTE", node_item.NodeName().c_str(), src_node_item->NodeName().c_str()); @@ -369,7 +378,10 @@ Status HybridModelBuilder::MergeInputNodes(ComputeGraph &graph) { Status HybridModelBuilder::MergeNetOutputNode(ComputeGraph &graph) { const auto &parent_node = graph.GetParentNode(); const NodePtr &net_output_node = graph.FindFirstNodeMatchType(NETOUTPUT); - GE_CHECK_NOTNULL(net_output_node); + if (net_output_node == nullptr) { + GELOGD("Graph has no netoutput no need to merge."); + return SUCCESS; + } const auto &net_output_desc = net_output_node->GetOpDesc(); GE_CHECK_NOTNULL(net_output_desc); @@ -441,17 +453,15 @@ Status HybridModelBuilder::UnfoldSubgraphs(ComputeGraph &root_graph, ComputeGrap continue; } - bool is_unknown_shape = false; - GE_CHK_GRAPH_STATUS_RET(NodeUtils::GetNodeUnknownShapeStatus(*node, is_unknown_shape), - "Failed to invoke GetNodeUnknownShapeStatus."); + auto subgraph = NodeUtils::GetSubgraph(*node, kSubgraphIndex); + GE_CHECK_NOTNULL(subgraph); + bool is_unknown_shape = subgraph->GetGraphUnknownFlag(); if (!is_unknown_shape) { merged_graph->AddNode(node); GELOGD("[%s] Known shape partitioned call added to merged graph.", op_desc->GetName().c_str()); continue; } - auto subgraph = NodeUtils::GetSubgraph(*node, kSubgraphIndex); - GE_CHECK_NOTNULL(subgraph); 
GE_CHK_GRAPH_STATUS_RET(UnfoldSubgraph(root_graph, *merged_graph, *subgraph), "[%s] Failed to merge subgraph.", subgraph->GetName().c_str()); } @@ -484,20 +494,10 @@ Status HybridModelBuilder::UnfoldSubgraph(ComputeGraph &root_graph, ComputeGraph if (sub_op_type == DATA_TYPE || sub_op_type == NETOUTPUT) { continue; } - - if (sub_op_type == CONSTANT || sub_op_type == VARIABLE) { - GELOGE(INTERNAL_ERROR, "Unexpected node in unknown subgraph. type = %s, node = %s::%s", sub_op_type.c_str(), - sub_graph.GetName().c_str(), sub_node->GetName().c_str()); - return INTERNAL_ERROR; - } - if (sub_op_type == PARTITIONEDCALL) { - bool is_unknown_shape = false; - GE_CHK_GRAPH_STATUS_RET(NodeUtils::GetNodeUnknownShapeStatus(*sub_node, is_unknown_shape), - "[%s] Failed to invoke GetNodeUnknownShapeStatus.", sub_node->GetName().c_str()); - if (is_unknown_shape) { - auto sub_sub_graph = NodeUtils::GetSubgraph(*sub_node, kSubgraphIndex); - GE_CHECK_NOTNULL(sub_sub_graph); + auto sub_sub_graph = NodeUtils::GetSubgraph(*sub_node, kSubgraphIndex); + GE_CHECK_NOTNULL(sub_sub_graph); + if (sub_sub_graph->GetGraphUnknownFlag()) { GE_CHK_STATUS_RET(UnfoldSubgraph(root_graph, parent_graph, *sub_sub_graph), "[%s] Failed to merge subgraph", sub_sub_graph->GetName().c_str()); continue; @@ -668,6 +668,19 @@ Status HybridModelBuilder::AssignUninitializedConstantOps() { } } + for (auto &it : hybrid_model_.device_variable_nodes_) { + const string &var_name = it.first; + const NodePtr &var_node = it.second; + auto tensor_desc = var_node->GetOpDesc()->MutableOutputDesc(0); + if (!var_manager_->IsVarExist(var_name, *tensor_desc)) { + // allocate constant + GELOGD("[%s] Constant not allocated during graph building. 
now allocate it.", var_name.c_str()); + GE_CHK_STATUS_RET(var_manager_->AssignVarMem(var_name, *tensor_desc, RT_MEMORY_HBM)); + GE_CHK_STATUS_RET(VarMemAssignUtil::AssignData2Fp32Var(var_node, runtime_param_.session_id)) + GE_CHK_STATUS_RET(var_manager_->SetAllocatedGraphId(var_name, runtime_param_.graph_id)); + } + } + return SUCCESS; } @@ -675,28 +688,32 @@ Status HybridModelBuilder::InitConstantOps() { for (auto &it : hybrid_model_.constant_op_nodes_) { const string &var_name = it.first; const NodePtr &var_node = it.second; - std::unique_ptr var_tensor; - - GE_CHK_STATUS_RET_NOLOG(VarNodeToTensor(var_node, var_tensor)); - GELOGD("Init const op tensor. name = %s, size = %ld", var_name.c_str(), var_tensor->GetSize()); - var_tensor->SetName("ConstOp_" + var_name); - auto op_desc = var_node->GetOpDesc(); auto v_weights = ModelUtils::GetWeights(op_desc); - auto v_output_size = var_tensor->GetSize(); - auto v_output_addr = var_tensor->MutableData(); - auto *ge_tensor = const_cast(v_weights[0].get()); - if (ge_tensor->GetData().size() > 0) { - GE_CHK_STATUS_RET_NOLOG(HandleDtString(*ge_tensor, v_output_addr)); - - GELOGI("[IMAS]InitConstant memcpy graph_%u type[V] name[%s] output[%d] memaddr[%p] mem_size[%zu] datasize[%zu]", - runtime_param_.graph_id, op_desc->GetName().c_str(), 0, v_output_addr, v_output_size, - ge_tensor->GetData().size()); - GE_CHK_RT_RET(rtMemcpy(v_output_addr, v_output_size, ge_tensor->GetData().data(), ge_tensor->GetData().size(), - RT_MEMCPY_HOST_TO_DEVICE)); + + std::unique_ptr var_tensor; + if (GetContext().GetHostExecFlag()) { + auto buffer = ge_tensor->MutableData(); + GELOGD("Init tensor with host constant. size = %zu", buffer.GetSize()); + var_tensor.reset(new (std::nothrow) TensorValue(buffer.GetData(), buffer.GetSize())); } else { - GELOGI("[%s] Const op has no weight data.", op_desc->GetName().c_str()); + GE_CHK_STATUS_RET_NOLOG(VarNodeToTensor(var_node, var_tensor)); + GELOGD("Init const op tensor. 
name = %s, size = %ld", var_name.c_str(), var_tensor->GetSize()); + var_tensor->SetName("ConstOp_" + var_name); + auto v_output_size = var_tensor->GetSize(); + auto v_output_addr = var_tensor->MutableData(); + if (ge_tensor->GetData().size() > 0) { + GE_CHK_STATUS_RET_NOLOG(HandleDtString(*ge_tensor, v_output_addr)); + + GELOGI("[IMAS]InitConstant memcpy graph_%u type[V] name[%s] output[%d] memaddr[%p] mem_size[%zu] datasize[%zu]", + runtime_param_.graph_id, op_desc->GetName().c_str(), 0, v_output_addr, v_output_size, + ge_tensor->GetData().size()); + GE_CHK_RT_RET(rtMemcpy(v_output_addr, v_output_size, ge_tensor->GetData().data(), ge_tensor->GetData().size(), + RT_MEMCPY_HOST_TO_DEVICE)); + } else { + GELOGI("[%s] Const op has no weight data.", op_desc->GetName().c_str()); + } } hybrid_model_.variable_tensors_.emplace(var_name, std::move(var_tensor)); @@ -706,7 +723,7 @@ Status HybridModelBuilder::InitConstantOps() { } Status HybridModelBuilder::InitVariableTensors() { - for (auto &it : hybrid_model_.variable_nodes_) { + for (auto &it : hybrid_model_.device_variable_nodes_) { string var_name = it.first; NodePtr &var_node = it.second; std::unique_ptr tensor; @@ -717,6 +734,27 @@ Status HybridModelBuilder::InitVariableTensors() { hybrid_model_.variable_tensors_.emplace(var_name, std::move(tensor)); } + for (const auto &it : hybrid_model_.host_variable_nodes_) { + auto op_desc = it.second->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + GeTensorDesc output_tensor = op_desc->GetOutputDesc(0); + int64_t tensor_size = 0; + if (TensorUtils::CalcTensorMemSize(output_tensor.GetShape(), output_tensor.GetFormat(), output_tensor.GetDataType(), + tensor_size) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Calculate variable size failed, node name:%s", it.first.c_str()); + return INTERNAL_ERROR; + } + SharedMemInfo mem_info(it.first, tensor_size); + if (HostMemManager::Instance().MallocSharedMemory(mem_info) != SUCCESS) { + GELOGE(GE_GRAPH_MALLOC_FAILED, "Host variable [%s] malloc 
failed.", it.first.c_str()); + return GE_GRAPH_MALLOC_FAILED; + } + GELOGD("Host variable [%s] malloc success.", it.first.c_str()); + + std::unique_ptr tensor(new (std::nothrow) TensorValue(mem_info.host_address, tensor_size)); + hybrid_model_.variable_tensors_.emplace(it.first, std::move(tensor)); + } + return SUCCESS; } @@ -837,7 +875,13 @@ Status HybridModelBuilder::IndexSpecialNodes() { auto op_type = node->GetType(); GELOGD("node name = %s, node type = %s", node->GetName().c_str(), node->GetType().c_str()); if (op_type == VARIABLE) { - hybrid_model_.variable_nodes_.emplace(node->GetName(), node); + string placement; + (void)AttrUtils::GetStr(node->GetOpDesc(), ATTR_VARIABLE_PLACEMENT, placement); + if (placement == "host") { + hybrid_model_.host_variable_nodes_.emplace(node->GetName(), node); + } else { + hybrid_model_.device_variable_nodes_.emplace(node->GetName(), node); + } } else if (op_type == CONSTANTOP) { hybrid_model_.constant_op_nodes_.emplace(node->GetName(), node); } else if (op_type == DATA && node->GetOwnerComputeGraph() != root_graph) { @@ -857,7 +901,6 @@ Status HybridModelBuilder::IndexSpecialNodes() { } } } - return SUCCESS; } @@ -1078,7 +1121,7 @@ Status HybridModelBuilder::TransAllVarData() { } std::vector variable_node_list; - for (auto &it : hybrid_model_.variable_nodes_) { + for (auto &it : hybrid_model_.device_variable_nodes_) { variable_node_list.emplace_back(it.second); GELOGD("[%s] added for trans var data", it.first.c_str()); } diff --git a/src/ge/hybrid/model/node_item.cc b/src/ge/hybrid/model/node_item.cc index 7ec8d946..fa6d28d9 100644 --- a/src/ge/hybrid/model/node_item.cc +++ b/src/ge/hybrid/model/node_item.cc @@ -110,8 +110,13 @@ Status NodeItem::Init() { (void)AttrUtils::GetInt(op_desc, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val); shape_inference_type = static_cast(unknown_shape_type_val); - GE_CHK_STATUS_RET(NodeUtils::GetNodeUnknownShapeStatus(*node, is_dynamic), "[%s] Failed to get shape status.", - 
node->GetName().c_str()); + (void)AttrUtils::GetBool(op_desc, ATTR_NAME_FORCE_UNKNOWN_SHAPE, is_dynamic); + GELOGD("node name = %s, is_dynamic = %d.", this->node_name.c_str(), is_dynamic); + if (!is_dynamic) { + GE_CHK_STATUS_RET(NodeUtils::GetNodeUnknownShapeStatus(*node, is_dynamic), "[%s] Failed to get shape status.", + node->GetName().c_str()); + } + GE_CHK_STATUS_RET(ParseFusedSubgraph(*this), "[%s] Failed to parse fused subgraph", node_name.c_str()); if (is_dynamic) { for (int i = 0; i < num_inputs; ++i) { diff --git a/src/ge/hybrid/node_executor/aicore/aicore_node_executor.cc b/src/ge/hybrid/node_executor/aicore/aicore_node_executor.cc index 942d6d9e..698b3ed2 100644 --- a/src/ge/hybrid/node_executor/aicore/aicore_node_executor.cc +++ b/src/ge/hybrid/node_executor/aicore/aicore_node_executor.cc @@ -158,9 +158,13 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function auto op_desc = context.GetNodeItem().op_desc; GE_CHECK_NOTNULL(op_desc); GELOGI("[%s] ExecuteAsync Start.", op_desc->GetName().c_str()); - for (auto &task : tasks_) { + for (auto it = tasks_.begin(); it != tasks_.end(); ++it) { + // AtomicAddrClean has 2 tasks + if (tasks_.size() == 2 && it == tasks_.begin() && !(*(tasks_.rbegin()))->GetClearAtomic()) { + continue; + } RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] Start"); - GE_CHK_STATUS_RET_NOLOG(task->LaunchKernel(context.GetStream())); + GE_CHK_STATUS_RET_NOLOG((*it)->LaunchKernel(context.GetStream())); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End"); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End"); } @@ -180,8 +184,12 @@ Status AiCoreNodeTask::UpdateArgs(TaskContext &context) { auto op_desc = context.GetNodeItem().op_desc; GE_CHECK_NOTNULL(op_desc); GELOGI("[%s] AiCoreNodeTask UpdateArgs Start.", op_desc->GetName().c_str()); - for (auto &task : 
tasks_) { - GE_CHK_STATUS_RET_NOLOG(task->UpdateArgs(context)); + for (auto it = tasks_.rbegin(); it != tasks_.rend(); ++it) { + GE_CHK_STATUS_RET_NOLOG((*it)->UpdateArgs(context)); + // AtomicAddrClean has 2 tasks + if (tasks_.size() == 2 && it == tasks_.rbegin() && !(*it)->GetClearAtomic()) { + break; + } } GELOGI("[%s] AiCoreNodeTask UpdateArgs End.", op_desc->GetName().c_str()); return SUCCESS; @@ -189,8 +197,12 @@ Status AiCoreNodeTask::UpdateArgs(TaskContext &context) { Status AiCoreNodeTask::UpdateTilingData(TaskContext &context) { GELOGD("[%s] PrepareWithShape started", context.GetNodeName()); - for (auto &task : tasks_) { - GE_CHK_STATUS_RET_NOLOG(task->PrepareWithShape(context)); + for (auto it = tasks_.rbegin(); it != tasks_.rend(); ++it) { + GE_CHK_STATUS_RET_NOLOG((*it)->PrepareWithShape(context)); + // AtomicAddrClean has 2 tasks + if (tasks_.size() == 2 && it == tasks_.rbegin() && !(*it)->GetClearAtomic()) { + break; + } } GELOGD("[%s] Done PrepareWithShape successfully.", context.GetNodeName()); return SUCCESS; diff --git a/src/ge/hybrid/node_executor/aicore/aicore_op_task.cc b/src/ge/hybrid/node_executor/aicore/aicore_op_task.cc index 9ec0cc22..838b8e40 100644 --- a/src/ge/hybrid/node_executor/aicore/aicore_op_task.cc +++ b/src/ge/hybrid/node_executor/aicore/aicore_op_task.cc @@ -112,6 +112,7 @@ Status AiCoreOpTask::UpdateTilingInfo(TaskContext &context) { GELOGD("[%s] Start to update tiling info for task: [%s]", node->GetName().c_str(), stub_name_.c_str()); OpRunInfo tiling_info; tiling_info.block_dim = -1; // codex: Using uninitialized value + tiling_info.clear_atomic = true; auto execution_context = context.GetExecutionContext(); RECORD_EXECUTION_EVENT(execution_context, context.GetNodeName(), "[CalcTilingInfo] Start"); @@ -121,6 +122,7 @@ Status AiCoreOpTask::UpdateTilingInfo(TaskContext &context) { // update op args by tiling info block_dim_ = static_cast(tiling_info.block_dim); op_desc->SetWorkspaceBytes(tiling_info.workspaces); + 
clear_atomic_ = tiling_info.clear_atomic; tiling_data_ = tiling_info.tiling_data.str(); if (tiling_data_.empty()) { diff --git a/src/ge/hybrid/node_executor/aicore/aicore_op_task.h b/src/ge/hybrid/node_executor/aicore/aicore_op_task.h index 41ab0d79..d5e5a56c 100644 --- a/src/ge/hybrid/node_executor/aicore/aicore_op_task.h +++ b/src/ge/hybrid/node_executor/aicore/aicore_op_task.h @@ -46,6 +46,8 @@ class AiCoreOpTask { const std::string &GetName() const; + bool GetClearAtomic() const { return clear_atomic_; } + protected: Status UpdateTilingInfo(TaskContext &context); virtual std::string GetKeyForOpParamSize() const; @@ -66,6 +68,7 @@ class AiCoreOpTask { std::unique_ptr args_ = nullptr; uint32_t args_size_ = 0; uint32_t block_dim_ = 1; + bool clear_atomic_ = true; }; class AtomicAddrCleanOpTask : public AiCoreOpTask { diff --git a/src/ge/hybrid/node_executor/aicore/aicore_task_compiler.cc b/src/ge/hybrid/node_executor/aicore/aicore_task_compiler.cc index 588f179d..e67a36c6 100644 --- a/src/ge/hybrid/node_executor/aicore/aicore_task_compiler.cc +++ b/src/ge/hybrid/node_executor/aicore/aicore_task_compiler.cc @@ -17,6 +17,7 @@ #include "aicore_task_compiler.h" #include "framework/common/debug/log.h" #include "graph/debug/ge_attr_define.h" +#include "opskernel_manager/ops_kernel_builder_manager.h" namespace ge { namespace hybrid { @@ -30,23 +31,23 @@ std::mutex AiCoreTaskCompiler::mu_; AiCoreTaskCompiler::AiCoreTaskCompiler(OpsKernelInfoStorePtr aic_kernel_store) : aic_kernel_store_(std::move(aic_kernel_store)) {} -Status AiCoreTaskCompiler::DoCompileOp(OpsKernelInfoStore &ops_store, const NodePtr &node) { +Status AiCoreTaskCompiler::DoCompileOp(const NodePtr &node) const { GE_CHECK_NOTNULL(node); + GE_CHECK_NOTNULL(aic_kernel_store_); vector node_vec; node_vec.emplace_back(node); - GE_CHK_STATUS_RET(ops_store.CompileOpRun(node_vec), "Failed to execute CompileOp, node = %s", - node->GetName().c_str()); - GE_CHK_STATUS_RET(ops_store.CalcOpRunningParam(*node), "Failed 
to execute CalcOpRunningParam, node = %s", + GE_CHK_STATUS_RET(aic_kernel_store_->CompileOpRun(node_vec), "Failed to execute CompileOp, node = %s", node->GetName().c_str()); + GE_CHK_STATUS_RET(OpsKernelBuilderManager::Instance().CalcOpRunningParam(*node), + "Failed to execute CalcOpRunningParam, node = %s", node->GetName().c_str()); return SUCCESS; } -Status AiCoreTaskCompiler::CompileOp(const NodePtr &node, std::vector &tasks) const { +Status AiCoreTaskCompiler::CompileOp(const NodePtr &node, std::vector &tasks) { GE_CHECK_NOTNULL(node); GELOGI("AiCoreTaskCompiler(%s) CompileOp Start.", node->GetName().c_str()); - GE_CHECK_NOTNULL(aic_kernel_store_); - GE_CHK_STATUS_RET_NOLOG(DoCompileOp(*aic_kernel_store_, node)); + GE_CHK_STATUS_RET_NOLOG(DoCompileOp(node)); GELOGD("successfully compiled op: %s", node->GetName().c_str()); auto op_desc = node->GetOpDesc(); @@ -56,14 +57,13 @@ Status AiCoreTaskCompiler::CompileOp(const NodePtr &node, std::vectorSetOutputOffset(output_offsets); std::vector workspaces(op_desc->GetWorkspaceBytes().size(), kMemBase); op_desc->SetWorkspace(std::move(workspaces)); - GE_CHK_STATUS_RET_NOLOG(DoGenerateTask(*aic_kernel_store_, *node, tasks)); + GE_CHK_STATUS_RET_NOLOG(DoGenerateTask(*node, tasks)); GELOGD("successfully generated task: %s", node->GetName().c_str()); GELOGI("AiCoreTaskCompiler(%s) CompileOp End.", node->GetName().c_str()); return SUCCESS; } -Status AiCoreTaskCompiler::DoGenerateTask(OpsKernelInfoStore &store, const Node &node, - std::vector &tasks) { +Status AiCoreTaskCompiler::DoGenerateTask(const Node &node, std::vector &tasks) { rtModel_t rt_model_ = nullptr; GE_CHK_RT_RET(rtModelCreate(&rt_model_, 0)); rtStream_t stream = nullptr; @@ -83,7 +83,7 @@ Status AiCoreTaskCompiler::DoGenerateTask(OpsKernelInfoStore &store, const Node Status ret; { std::lock_guard lk(mu_); - ret = store.GenerateTask(node, context, tasks); + ret = OpsKernelBuilderManager::Instance().GenerateTask(node, context, tasks); } GE_CHK_STATUS(ret, 
"Failed to execute GenerateTask, node = %s", node.GetName().c_str()); diff --git a/src/ge/hybrid/node_executor/aicore/aicore_task_compiler.h b/src/ge/hybrid/node_executor/aicore/aicore_task_compiler.h index 39673188..cc3897ca 100644 --- a/src/ge/hybrid/node_executor/aicore/aicore_task_compiler.h +++ b/src/ge/hybrid/node_executor/aicore/aicore_task_compiler.h @@ -27,11 +27,11 @@ class AiCoreTaskCompiler { explicit AiCoreTaskCompiler(OpsKernelInfoStorePtr aic_kernel_store); ~AiCoreTaskCompiler() = default; - Status CompileOp(const NodePtr &node, std::vector &tasks) const; + Status CompileOp(const NodePtr &node, std::vector &tasks); private: - static Status DoCompileOp(OpsKernelInfoStore &store, const NodePtr &node); - static Status DoGenerateTask(OpsKernelInfoStore &store, const Node &node, std::vector &tasks); + Status DoCompileOp(const NodePtr &node) const; + Status DoGenerateTask(const Node &node, std::vector &tasks); OpsKernelInfoStorePtr aic_kernel_store_; static std::mutex mu_; }; diff --git a/src/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc b/src/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc index 871f1db4..0cd2daf4 100644 --- a/src/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc +++ b/src/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc @@ -20,7 +20,7 @@ #include "graph/load/new_model_manager/model_manager.h" #include "hybrid/executor/hybrid_execution_context.h" #include "hybrid/model/hybrid_model.h" -#include "init/gelib.h" +#include "opskernel_manager/ops_kernel_builder_manager.h" namespace ge { namespace hybrid { @@ -448,14 +448,10 @@ Status AicpuTfNodeTask::PrepareCopyInputs(const TaskContext &context, } Status AicpuTfNodeTask::GenMemCopyTask(uint64_t copy_num, STR_FWK_OP_KERNEL &task, std::string &task_info) { - auto instance_ptr = ge::GELib::GetInstance(); - GE_CHK_BOOL_RET_STATUS(instance_ptr != nullptr && instance_ptr->InitFlag(), GE_CLI_GE_NOT_INITIALIZED, - "GE is not initialized"); - - static constexpr const char *const 
kKernelLibName = "aicpu_kernel"; - OpsKernelInfoStorePtr kernel_info = instance_ptr->OpsKernelManagerObj().GetOpsKernelInfoStore(kKernelLibName); - GE_CHK_BOOL_RET_STATUS(kernel_info != nullptr, FAILED, "Get op kernel info store[%s] failed", kKernelLibName); - auto ret = kernel_info->GenMemCopyTask(copy_num, task, task_info); + static constexpr const char *const kKernelLibName = "aicpu_tf_kernel"; + auto kernel_builder = OpsKernelBuilderManager::Instance().GetOpsKernelBuilder(kKernelLibName); + GE_CHK_BOOL_RET_STATUS(kernel_builder != nullptr, FAILED, "Get op kernel info store[%s] failed", kKernelLibName); + auto ret = kernel_builder->GenMemCopyTask(copy_num, task, task_info); GE_CHK_STATUS_RET(ret, "Call aicpu GenMemCopyTask failed, copy_num=%lu, ret=%u", copy_num, ret); return SUCCESS; } diff --git a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc index f2cd1888..1edd6135 100644 --- a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc +++ b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.cc @@ -17,6 +17,7 @@ #include "hybrid/node_executor/hccl/hccl_node_executor.h" #include "common/ge/ge_util.h" #include "common/ge/plugin_manager.h" +#include "common/math/math_util.h" #include "framework/common/debug/ge_log.h" #include "graph/attr_value.h" #include "graph/debug/ge_attr_define.h" @@ -162,12 +163,13 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector(reinterpret_cast(tv->MutableData())); + addr_infos.resize(dims.front()); for (auto idx = 0; idx < dims.front(); ++idx) { - addr_infos.push_back({static_cast(data[idx * kVarTableRowCnt]), - data[idx * kVarTableRowCnt + kVarTableIdxAddr], local_addr, - data[idx * kVarTableRowCnt + kVarTableIdxLen]}); - local_addr += data[idx * kVarTableRowCnt + kVarTableIdxLen]; + FMK_INT64_MULCHECK(idx, kVarTableRowCnt); + auto line_idx = idx * kVarTableRowCnt; + addr_infos[idx] = {static_cast(data[line_idx]), data[line_idx + kVarTableIdxAddr], 
local_addr, + data[line_idx + kVarTableIdxLen]}; + local_addr += data[line_idx + kVarTableIdxLen]; } return SUCCESS; diff --git a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h index ddf6eb3a..f27ec67c 100644 --- a/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h +++ b/src/ge/hybrid/node_executor/hccl/hccl_node_executor.h @@ -16,6 +16,7 @@ #ifndef HYBRID_HCCL_NODE_EXECUTOR_H_ #define HYBRID_HCCL_NODE_EXECUTOR_H_ +#include "common/opskernel/ge_task_info.h" #include "graph/op_desc.h" #include "hybrid/model/hybrid_model.h" #include "hybrid/node_executor/node_executor.h" diff --git a/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc b/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc index 1c98abee..3f6440dc 100644 --- a/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc +++ b/src/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc @@ -110,7 +110,9 @@ Status HostCpuNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &no auto op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); auto mem_type = static_cast(HOST_DDR); - (void)AttrUtils::SetInt(op_desc, ATTR_OUTPUT_MEMORY_TYPE, mem_type); + for (size_t i = 0; i < op_desc->GetOutputsSize(); i++) { + (void)AttrUtils::SetInt(op_desc->MutableOutputDesc(i), ATTR_OUTPUT_MEMORY_TYPE, mem_type); + } const std::string &name = node->GetName(); const std::string &type = node->GetType(); if (HostCpuEngine::GetInstance().CheckSupported(type)) { @@ -128,4 +130,4 @@ Status HostCpuNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &no return SUCCESS; } } // namespace hybrid -} // namespace ge \ No newline at end of file +} // namespace ge diff --git a/src/ge/hybrid/node_executor/node_executor.cc b/src/ge/hybrid/node_executor/node_executor.cc index 8de15ea0..95f1e17c 100644 --- a/src/ge/hybrid/node_executor/node_executor.cc +++ b/src/ge/hybrid/node_executor/node_executor.cc @@ -19,13 +19,16 @@ #include 
"graph/utils/node_utils.h" #include "init/gelib.h" #include "hybrid/model/hybrid_model.h" +#include "graph/debug/ge_attr_define.h" +#include "opskernel_manager/ops_kernel_builder_manager.h" namespace ge { namespace hybrid { namespace { const char *const kEngineNameAiCore = "AIcoreEngine"; const char *const kEngineNameGeLocal = "DNN_VM_GE_LOCAL_OP_STORE"; -const char *const kEngineNameAiCpu = "aicpu_kernel"; +const char *const kEngineNameAiCpu = "aicpu_ascend_kernel"; +const char *const kEngineNameAiCpuTf = "aicpu_tf_kernel"; const char *const kEngineNameHccl = "ops_kernel_info_hccl"; const char *const kEngineNameRts = "DNN_VM_RTS_OP_STORE"; const char *const kEngineNameHostCpu = "DNN_VM_HOST_CPU_OP_STORE"; @@ -61,23 +64,12 @@ Status NodeExecutorManager::EnsureInitialized() { engine_mapping_.emplace(kEngineNameAiCore, NodeExecutorManager::ExecutorType::AICORE); engine_mapping_.emplace(kEngineNameGeLocal, NodeExecutorManager::ExecutorType::GE_LOCAL); + engine_mapping_.emplace(kEngineNameAiCpuTf, NodeExecutorManager::ExecutorType::AICPU_TF); engine_mapping_.emplace(kEngineNameAiCpu, NodeExecutorManager::ExecutorType::AICPU_TF); engine_mapping_.emplace(kEngineNameHccl, NodeExecutorManager::ExecutorType::HCCL); engine_mapping_.emplace(kEngineNameRts, NodeExecutorManager::ExecutorType::RTS); engine_mapping_.emplace(kEngineNameHostCpu, NodeExecutorManager::ExecutorType::HOST_CPU); - std::shared_ptr instance_ptr = GELib::GetInstance(); - if ((instance_ptr == nullptr) || (!instance_ptr->InitFlag())) { - GELOGW("GELib not initialized"); - return FAILED; - } - - OpsKernelManager &ops_kernel_manager = instance_ptr->OpsKernelManagerObj(); - for (auto &it : ops_kernel_manager.GetAllOpsKernelInfoStores()) { - GELOGD("add kernel store: %s", it.first.c_str()); - kernel_stores_.emplace(it.first, it.second.get()); - } - initialized_ = true; GELOGI("Initializing NodeExecutors successfully"); return SUCCESS; @@ -86,6 +78,11 @@ Status NodeExecutorManager::EnsureInitialized() { 
NodeExecutorManager::ExecutorType NodeExecutorManager::ResolveExecutorType(Node &node) const { auto op_type = node.GetType(); if (op_type == PARTITIONEDCALL) { + const auto &subgraph = NodeUtils::GetSubgraph(node, 0); + if (subgraph != nullptr && subgraph->GetGraphUnknownFlag()) { + GELOGD("node %s was marked as unknown shape in node_executor.", node.GetName().c_str()); + return ExecutorType::DYNAMIC_SUBGRAPH; + } bool is_dynamic = false; (void)NodeUtils::GetNodeUnknownShapeStatus(node, is_dynamic); if (is_dynamic) { @@ -144,13 +141,6 @@ Status NodeExecutorManager::CalcOpRunningParam(Node &node) const { TensorUtils::SetSize(*(output_tensor.get()), 0); } - auto it = kernel_stores_.find(op_desc->GetOpKernelLibName()); - if (it == kernel_stores_.end()) { - GELOGE(INTERNAL_ERROR, "Failed to get OpKernelStore. libName = %s, node = %s", - op_desc->GetOpKernelLibName().c_str(), op_desc->GetName().c_str()); - return INTERNAL_ERROR; - } - // calc hccl output size independent, hccl ops kernel manager should GetSize for // input which is the output size of input-op, but sometimes return error // when multi-thread @@ -173,7 +163,8 @@ Status NodeExecutorManager::CalcOpRunningParam(Node &node) const { } return SUCCESS; } - return it->second->CalcOpRunningParam(node); + + return OpsKernelBuilderManager::Instance().CalcOpRunningParam(node); } Status NodeExecutorManager::InitializeExecutors() { diff --git a/src/ge/hybrid/node_executor/node_executor.h b/src/ge/hybrid/node_executor/node_executor.h index 79726b09..4fd55410 100644 --- a/src/ge/hybrid/node_executor/node_executor.h +++ b/src/ge/hybrid/node_executor/node_executor.h @@ -18,7 +18,7 @@ #define GE_HYBRID_NODE_EXECUTOR_NODE_EXECUTOR_H_ #include "external/ge/ge_api_error_codes.h" -#include "common/opskernel/ops_kernel_info_store.h" +#include "common/opskernel/ops_kernel_builder.h" #include "graph/node.h" #include "task_context.h" @@ -186,7 +186,6 @@ class NodeExecutorManager { private: std::map> executors_; std::map> builders_; 
- std::map kernel_stores_; std::map engine_mapping_; std::mutex mu_; bool initialized_ = false; diff --git a/src/ge/hybrid/node_executor/task_context.cc b/src/ge/hybrid/node_executor/task_context.cc index e49a2b43..430ec63f 100644 --- a/src/ge/hybrid/node_executor/task_context.cc +++ b/src/ge/hybrid/node_executor/task_context.cc @@ -227,7 +227,7 @@ Status TaskContext::AllocateOutputs(AllocationAttr *attr) { const auto &output_desc = node_item_->op_desc->MutableOutputDesc(i); GE_CHECK_NOTNULL(output_desc); uint32_t mem_type = 0; - (void)AttrUtils::GetInt(node_item_->op_desc, ATTR_OUTPUT_MEMORY_TYPE, mem_type); + (void)AttrUtils::GetInt(output_desc, ATTR_OUTPUT_MEMORY_TYPE, mem_type); if (attr == nullptr) { auto tmp_attr = AllocationAttr(0, nullptr, static_cast(mem_type)); GE_CHK_STATUS_RET_NOLOG(AllocateOutput(i, *output_desc, nullptr, &tmp_attr)); diff --git a/src/ge/inc/graph_pass.h b/src/ge/inc/graph_pass.h index d4abdd2f..d3bc931c 100644 --- a/src/ge/inc/graph_pass.h +++ b/src/ge/inc/graph_pass.h @@ -22,6 +22,7 @@ #include "common/op/attr_value_util.h" #include "common/op/ge_op_utils.h" +#include "common/types.h" #include "framework/common/debug/ge_log.h" #include "graph/compute_graph.h" #include "graph/utils/attr_utils.h" diff --git a/src/ge/init/gelib.cc b/src/ge/init/gelib.cc index e00268ea..838aeb0d 100644 --- a/src/ge/init/gelib.cc +++ b/src/ge/init/gelib.cc @@ -39,9 +39,11 @@ #include "graph/ge_global_options.h" #include "graph/load/new_model_manager/model_manager.h" #include "graph/manager/graph_mem_allocator.h" +#include "graph/manager/host_mem_manager.h" #include "graph/manager/graph_var_manager.h" #include "omm/csa_interact.h" #include "runtime/kernel.h" +#include "opskernel_manager/ops_kernel_builder_manager.h" using Json = nlohmann::json; @@ -74,6 +76,7 @@ Status GELib::Initialize(const map &options) { GELOGE(ret, "GeLib initial failed."); return ret; } + instancePtr_->SetDefaultPrecisionMode(new_options); 
GetMutableGlobalOptions().insert(new_options.begin(), new_options.end()); GetThreadLocalContext().SetGlobalOption(GetMutableGlobalOptions()); GE_TIMESTAMP_START(Init); @@ -124,6 +127,16 @@ Status GELib::InnerInitialize(const map &options) { return initOpsStatus; } + GELOGI("opsBuilderManager initial."); + GE_TIMESTAMP_START(OpsKernelBuilderManagerInitialize); + Status initOpsBuilderStatus = OpsKernelBuilderManager::Instance().Initialize(options); + GE_TIMESTAMP_END(OpsKernelBuilderManagerInitialize, "InnerInitialize::OpsKernelBuilderManager"); + if (initOpsBuilderStatus != SUCCESS) { + GELOGE(initOpsBuilderStatus); + RollbackInit(); + return initOpsBuilderStatus; + } + GELOGI("sessionManager initial."); GE_TIMESTAMP_START(SessionManagerInitialize); Status initSmStatus = sessionManager_.Initialize(options); @@ -195,6 +208,26 @@ void GELib::InitProfiling(Options &options) { } } +void GELib::SetDefaultPrecisionMode(map &new_options) { + auto iter = new_options.find(PRECISION_MODE); + if (iter != new_options.end()) { + GELOGI("Find precision_mode in options, value is %s", iter->second.c_str()); + return; + } + iter = new_options.find(OPTION_GRAPH_RUN_MODE); + if (iter != new_options.end()) { + if (GraphRunMode(std::strtol(iter->second.c_str(), nullptr, kDecimal)) >= TRAIN) { + // only train mode need to be set allow_fp32_to_fp16. 
+ GELOGI("This is train mode, precision_mode need to be set allow_fp32_to_fp16"); + new_options.insert(std::make_pair(PRECISION_MODE, "allow_fp32_to_fp16")); + return; + } + } + GELOGI("This is not train mode, precision_mode need to be set force_fp16"); + new_options.insert(std::make_pair(PRECISION_MODE, "force_fp16")); + return; +} + Status GELib::SetRTSocVersion(const map &options, map &new_options) { GELOGI("Start to set SOC_VERSION"); new_options.insert(options.begin(), options.end()); @@ -281,12 +314,14 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt std::vector mem_type; mem_type.push_back(RT_MEMORY_HBM); + mem_type.push_back(RT_MEMORY_P2P_DDR); Status initMmStatus = MemManager::Instance().Initialize(mem_type); if (initMmStatus != SUCCESS) { GELOGE(initMmStatus, "[Initialize] MemoryAllocatorManager initialize failed."); return initMmStatus; } + GE_CHK_STATUS_RET(HostMemManager::Instance().Initialize()); // Update CSA file CsaInteract::GetInstance().Init(options.device_id, GetContext().TraceId()); Status ret = CsaInteract::GetInstance().WriteJobState(JOBSTATE_RUNNING, JOBSUBSTATE_ENV_INIT); @@ -334,11 +369,13 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithout std::vector mem_type; mem_type.push_back(RT_MEMORY_HBM); + mem_type.push_back(RT_MEMORY_P2P_DDR); Status initMmStatus = MemManager::Instance().Initialize(mem_type); if (initMmStatus != SUCCESS) { GELOGE(initMmStatus, "[Initialize] MemoryAllocatorManager initialize failed."); return initMmStatus; } + GE_CHK_STATUS_RET(HostMemManager::Instance().Initialize()); static bool is_inited = false; if (is_inited) { @@ -379,6 +416,12 @@ Status GELib::Finalize() { final_state = mid_state; } + GELOGI("opsBuilderManager finalization."); + mid_state = OpsKernelBuilderManager::Instance().Finalize(); + if (mid_state != SUCCESS) { + GELOGW("opsBuilderManager finalize failed"); + final_state = mid_state; + } GELOGI("opsManager finalization."); mid_state = 
opsManager_.Finalize(); if (mid_state != SUCCESS) { @@ -392,6 +435,9 @@ Status GELib::Finalize() { GELOGI("MemManager finalization."); MemManager::Instance().Finalize(); + GELOGI("HostMemManager finalization."); + HostMemManager::Instance().Finalize(); + GELOGI("HostCpuEngine finalization."); HostCpuEngine::GetInstance().Finalize(); @@ -453,6 +499,7 @@ void GELib::RollbackInit() { (void)sessionManager_.Finalize(); } MemManager::Instance().Finalize(); + HostMemManager::Instance().Finalize(); VarManagerPool::Instance().Destory(); } } // namespace ge diff --git a/src/ge/init/gelib.h b/src/ge/init/gelib.h index b5621dfd..0401ecf4 100644 --- a/src/ge/init/gelib.h +++ b/src/ge/init/gelib.h @@ -81,6 +81,7 @@ class GELib { Status InnerInitialize(const map &options); Status SystemInitialize(const map &options); Status SetRTSocVersion(const map &options, map &new_options); + void SetDefaultPrecisionMode(map &new_options); void RollbackInit(); void InitOptions(const map &options); void SetDumpModelOptions(const map &options); diff --git a/src/ge/ir_build/atc_ir_common.cc b/src/ge/ir_build/atc_ir_common.cc index 1f8abf37..658deea0 100644 --- a/src/ge/ir_build/atc_ir_common.cc +++ b/src/ge/ir_build/atc_ir_common.cc @@ -68,21 +68,18 @@ bool CheckDynamicBatchSizeInputShapeValid(unordered_map> int32_t size = 0; for (auto iter = shape_map.begin(); iter != shape_map.end(); ++iter) { vector shape = iter->second; - if (shape.size() < 1) { + if (shape.empty()) { ErrorManager::GetInstance().ATCReportErrMessage("E10012"); GELOGE(ge::PARAM_INVALID, "--input_shape's shape size can not be less than 1 when set --dynamic_batch_size."); return false; } - if (shape[0] == kDynamicInputDim) { - for (size_t i = 1; i < shape.size(); ++i) { - if (shape[i] < 1) { - ErrorManager::GetInstance().ATCReportErrMessage("E10018", {"index", "shape"}, - {std::to_string(i), std::to_string(shape[i])}); - GELOGE(ge::PARAM_INVALID, "Only batch N can be -1 when set --dynamic_batch_size, current shape[%zu] is %ld", 
- i, shape[i]); - return false; - } - } + + if (std::count(shape.begin(), shape.end(), kDynamicInputDim) == 0) { + continue; + } + + bool ret = multibatch::CheckDynamicBatchShape(shape, iter->first); + if (ret) { size++; } } @@ -111,7 +108,7 @@ bool CheckDynamicBatchSizeInputShapeValid(unordered_map> bool CheckDynamicImagesizeInputShapeValid(unordered_map> shape_map, const std::string input_format, std::string &dynamic_image_size) { int32_t size = 0; - for (unordered_map>::iterator iter = shape_map.begin(); iter != shape_map.end(); ++iter) { + for (auto iter = shape_map.begin(); iter != shape_map.end(); ++iter) { vector shape = iter->second; // only support four dim if (shape.size() != DIM_DEFAULT_SIZE) { @@ -124,28 +121,14 @@ bool CheckDynamicImagesizeInputShapeValid(unordered_map> continue; } - int64_t height = 0; - int64_t width = 0; - if (input_format == "NCHW") { - height = shape[NCHW_DIM_H]; - width = shape[NCHW_DIM_W]; - } - - if (input_format == "NHWC") { - height = shape[NHWC_DIM_H]; - width = shape[NHWC_DIM_W]; + if (std::count(shape.begin(), shape.end(), kDynamicInputDim) == 0) { + continue; } - - if (height == kDynamicInputDim && width == kDynamicInputDim && - std::count(shape.begin(), shape.end(), kDynamicInputDim) == kDynamicImageSizeNum) { + auto ret = multibatch::CheckDynamicImageSizeShape(shape, iter->first, input_format); + if (ret) { size++; - } else if (std::count(shape.begin(), shape.end(), kDynamicInputDim) == 0) { - continue; } else { - ErrorManager::GetInstance().ATCReportErrMessage("E10019"); - GELOGE(ge::PARAM_INVALID, - "--input_shape's shape is invalid, only height and width can be -1 when set --dynamic_image_size."); - return false; + return ret; } } if (size == 0) { diff --git a/src/ge/ir_build/atc_ir_common.h b/src/ge/ir_build/atc_ir_common.h index 53143c2b..843c02c5 100644 --- a/src/ge/ir_build/atc_ir_common.h +++ b/src/ge/ir_build/atc_ir_common.h @@ -27,6 +27,7 @@ #include "framework/common/debug/ge_log.h" #include 
"framework/common/ge_inner_error_codes.h" #include "framework/omg/omg_inner_types.h" +#include "graph/preprocess/multi_batch_options.h" namespace ge { static std::set caffe_support_input_format = {"NCHW", "ND"}; diff --git a/src/ge/ir_build/ge_ir_build.cc b/src/ge/ir_build/ge_ir_build.cc index 86b304c1..82344252 100644 --- a/src/ge/ir_build/ge_ir_build.cc +++ b/src/ge/ir_build/ge_ir_build.cc @@ -35,6 +35,7 @@ #include "init/gelib.h" #include "ir_build/atc_ir_common.h" #include "model/ge_model.h" +#include "graph/shape_refiner.h" using std::string; using namespace std; @@ -167,6 +168,7 @@ class Impl { graphStatus InitDomiOmgContext(const string &input_shape, const string &input_format, const string &net_format, bool is_dynamic_input); void SetRtSocVersion(); + void UpdateThreadContext(); public: ge::GeGenerator generator_; @@ -220,8 +222,6 @@ graphStatus Impl::Init(const std::map &options) { return ret; } - GetThreadLocalContext().SetGlobalOption(GetMutableGlobalOptions()); - GetThreadLocalContext().SetGraphOption(options_); std::string build_mode = (options_.find(BUILD_MODE) == options_.end() || options_[BUILD_MODE] == BUILD_MODE_NORMAL) ? "" : options_[BUILD_MODE]; @@ -266,6 +266,9 @@ graphStatus Impl::Init(const std::map &options) { GE_CHK_BOOL_EXEC(ge::CheckInsertOpConfParamValid(std::string(insert_op_conf)) == ge::SUCCESS, return ge::GRAPH_PARAM_INVALID, "check insert op conf failed!"); + GE_CHK_BOOL_EXEC(insert_op_conf.empty() || dynamic_dims.empty(), return ge::GRAPH_PARAM_INVALID, + "dynamic dims function does not support aipp"); + // for IR builder.Only support om mode, so here fixed; options_.insert(std::pair(string(IR_OPTION_MODE), to_string(0))); options_.insert(std::pair(string(IR_OPTION_TARGET), "mini")); @@ -276,7 +279,7 @@ graphStatus Impl::Init(const std::map &options) { ge::PrintOptionMap(options_, "ge option"); SetRtSocVersion(); - + UpdateThreadContext(); // 3. 
init generator with options_ ret = generator_.Initialize(options_, omg_context_); if (ret != GRAPH_SUCCESS) { @@ -288,7 +291,7 @@ graphStatus Impl::Init(const std::map &options) { } void Impl::SetRtSocVersion() { - auto &global_options = GetMutableGlobalOptions(); + const auto &global_options = GetMutableGlobalOptions(); auto it = global_options.find(ge::SOC_VERSION); if (it != global_options.end()) { const char *soc_version = it->second.c_str(); @@ -300,6 +303,11 @@ void Impl::SetRtSocVersion() { } } +void Impl::UpdateThreadContext() { + GetThreadLocalContext().SetGlobalOption(GetMutableGlobalOptions()); + GetThreadLocalContext().SetGraphOption(options_); +} + graphStatus Impl::CreateInputsForIRBuild(const ge::Graph &graph, vector &inputs) { auto compute_graph = ge::GraphUtils::GetComputeGraph(graph); GE_CHECK_NOTNULL(compute_graph); @@ -323,13 +331,16 @@ graphStatus Impl::CreateInputsForIRBuild(const ge::Graph &graph, vectorGetAllNodes()) { + graphStatus ret = ShapeRefiner::InferShapeAndType(node); + if (ret == GRAPH_PARAM_INVALID) { + GELOGW("Can not find infershape func."); + continue; + } else if (ret != GRAPH_SUCCESS) { + GELOGE(ret, "Acl infershape failed."); + return ret; + } + } + + return GRAPH_SUCCESS; +} + +graphStatus aclgrphDumpGraph(const ge::Graph &graph, const char *file, const size_t len) { + GE_CHECK_NOTNULL(file); + + if (len > PATH_MAX || len != strlen(file) || strlen(file) == 0) { + GELOGE(GRAPH_PARAM_INVALID, "File path invalid."); + return GRAPH_PARAM_INVALID; + } + + auto compute_graph = GraphUtils::GetComputeGraph(graph); + GE_CHECK_NOTNULL(compute_graph); + + string full_path(file, len); + for (size_t i = 0; i < len; i++) { + if (full_path[i] == '\\') { + full_path.replace(i, 1, "/"); + } + } + + string suffix; + string file_path; + int pos = full_path.rfind("/"); + if (pos != -1) { + suffix = full_path.substr(pos + 1, -1); + file_path = full_path.substr(0, pos); + } else { + suffix = full_path; + file_path = "./"; + } + + if 
(suffix.empty()) { + suffix = compute_graph->GetName(); + if (suffix.empty()) { + suffix = "graph"; + } + } + + char path[PATH_MAX] = {0}; + if (realpath(file_path.c_str(), path) == nullptr) { + GELOGE(GRAPH_PARAM_INVALID, "Dump file path:%s is invalid.", file); + return GRAPH_PARAM_INVALID; + } + + GraphUtils::DumpGEGrph(compute_graph, string(path), suffix); + GraphUtils::DumpGrphToOnnx(*compute_graph, string(path), suffix); + uint64_t i = 0; + for (const auto &sub_graph_func : compute_graph->GetAllSubgraphs()) { + auto sub_graph_func_name = suffix + std::string("_sub_graph_") + std::to_string(i++); + GraphUtils::DumpGEGrph(sub_graph_func, string(path), sub_graph_func_name); + GraphUtils::DumpGrphToOnnx(*sub_graph_func, string(path), sub_graph_func_name); + } + + return GRAPH_SUCCESS; +} + } // namespace ge diff --git a/src/ge/model/ge_model.cc b/src/ge/model/ge_model.cc index 70251876..9ba4c2b7 100644 --- a/src/ge/model/ge_model.cc +++ b/src/ge/model/ge_model.cc @@ -23,6 +23,7 @@ namespace ge { void GeModel::Init() { (void)AttrUtils::SetInt(this, ATTR_MODEL_MEMORY_SIZE, 0); + (void)AttrUtils::SetInt(this, ATTR_MODEL_P2P_MEMORY_SIZE, 0); (void)AttrUtils::SetInt(this, ATTR_MODEL_STREAM_NUM, 0); (void)AttrUtils::SetInt(this, ATTR_MODEL_EVENT_NUM, 0); (void)AttrUtils::SetInt(this, ATTR_MODEL_LABEL_NUM, 0); diff --git a/src/ge/opskernel_manager/ops_kernel_builder_manager.cc b/src/ge/opskernel_manager/ops_kernel_builder_manager.cc new file mode 100644 index 00000000..6afcc891 --- /dev/null +++ b/src/ge/opskernel_manager/ops_kernel_builder_manager.cc @@ -0,0 +1,153 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "init/gelib.h" +#include "ops_kernel_builder_manager.h" +#include "register/ops_kernel_builder_registry.h" + +namespace ge { +namespace { +const std::vector kBasicBuilderLibs = {"libge_local_opskernel_builder.so", + "libhost_cpu_opskernel_builder.so", "librts_kernel_builder.so", + "libaicpu_ascend_builder.so", "libaicpu_tf_builder.so"}; + +const std::vector kHcclBuilderLibs = {"libhcom_opskernel_builder.so", "libhvd_opskernel_builder.so"}; +} // namespace +OpsKernelBuilderManager::~OpsKernelBuilderManager() { + // it's OK to call Finalize multiply times + (void)Finalize(); +} + +OpsKernelBuilderManager &OpsKernelBuilderManager::Instance() { + static OpsKernelBuilderManager instance; + return instance; +} + +Status OpsKernelBuilderManager::Initialize(const map &options, bool is_train) { + if (is_train) { + std::string lib_paths; + GE_CHK_STATUS_RET_NOLOG(GetLibPaths(options, lib_paths)); + plugin_manager_.reset(new (std::nothrow) PluginManager()); + GE_CHECK_NOTNULL(plugin_manager_); + GE_CHK_STATUS_RET(plugin_manager_->LoadSo(lib_paths), "Failed to load libs"); + } + + auto &kernel_builders = OpsKernelBuilderRegistry::GetInstance().GetAll(); + GELOGI("Number of OpBuild = %zu", kernel_builders.size()); + + for (const auto &it : kernel_builders) { + const std::string &kernel_lib_name = it.first; + GELOGI("Initialize ops kernel util for %s", kernel_lib_name.c_str()); + GE_CHECK_NOTNULL(it.second); + GE_CHK_STATUS_RET(it.second->Initialize(options), "Failed to invoke Initialize, kernel lib name = %s", + kernel_lib_name.c_str()); + + 
ops_kernel_builders_.emplace(kernel_lib_name, it.second); + } + + return SUCCESS; +} + +Status OpsKernelBuilderManager::Finalize() { + for (const auto &it : ops_kernel_builders_) { + const std::string &kernel_lib_name = it.first; + GELOGI("Finalize ops kernel util for %s", kernel_lib_name.c_str()); + auto ret = it.second->Finalize(); + if (ret != SUCCESS) { + GELOGW("Failed to invoke Finalize, kernel lib name = %s", kernel_lib_name.c_str()); + } + } + + ops_kernel_builders_.clear(); + plugin_manager_.reset(); + return SUCCESS; +} + +const map &OpsKernelBuilderManager::GetAllOpsKernelBuilders() const { + return ops_kernel_builders_; +} + +OpsKernelBuilderPtr OpsKernelBuilderManager::GetOpsKernelBuilder(const string &name) const { + auto it = ops_kernel_builders_.find(name); + if (it != ops_kernel_builders_.end()) { + return it->second; + } + + GELOGW("Failed to get opsKernelInfoStore object by name. OpKernelLibName is %s", name.c_str()); + return nullptr; +} + +Status OpsKernelBuilderManager::GetLibPaths(const std::map &options, std::string &lib_paths) { + GELOGD("Start to execute GetLibPaths"); + std::string path_base = PluginManager::GetPath(); + std::string so_path = "plugin/opskernel/"; + std::string path = path_base + so_path; + std::string all_lib_paths; + for (const auto &lib_name : kBasicBuilderLibs) { + all_lib_paths += (path + lib_name + ":"); + } + + auto iter = options.find(OPTION_EXEC_HCCL_FLAG); + if (iter == options.end() || iter->second != "0") { + for (const auto &lib_name : kHcclBuilderLibs) { + all_lib_paths += (path + lib_name + ":"); + } + } + + lib_paths = std::move(all_lib_paths); + GELOGI("Get lib paths by default. 
paths = %s", lib_paths.c_str()); + return SUCCESS; +} + +Status OpsKernelBuilderManager::CalcOpRunningParam(Node &node) const { + auto op_desc = node.GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + const std::string &lib_name = op_desc->GetOpKernelLibName(); + auto it = ops_kernel_builders_.find(lib_name); + if (it == ops_kernel_builders_.end()) { + GELOGE(INTERNAL_ERROR, "Failed to get OpKernelStore. libName = %s, node = %s", lib_name.c_str(), + op_desc->GetName().c_str()); + return INTERNAL_ERROR; + } + + GELOGD("To invoke CalcOpRunningParam, node = %s, lib name = %s", op_desc->GetName().c_str(), lib_name.c_str()); + GE_CHK_STATUS_RET(it->second->CalcOpRunningParam(node), + "Failed to invoke CalcOpRunningParam, libName = %s, node = %s", lib_name.c_str(), + op_desc->GetName().c_str()); + GELOGD("Done invoking CalcOpRunningParam successfully"); + return SUCCESS; +} + +Status OpsKernelBuilderManager::GenerateTask(const Node &node, RunContext &context, + std::vector &tasks) const { + auto op_desc = node.GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + const std::string &lib_name = op_desc->GetOpKernelLibName(); + auto it = ops_kernel_builders_.find(lib_name); + if (it == ops_kernel_builders_.end()) { + GELOGE(INTERNAL_ERROR, "Failed to get OpKernelStore. 
libName = %s, node = %s", lib_name.c_str(), + op_desc->GetName().c_str()); + return INTERNAL_ERROR; + } + + GELOGD("To invoke GenerateTask, node = %s, lib name = %s", op_desc->GetName().c_str(), lib_name.c_str()); + GE_CHK_STATUS_RET(it->second->GenerateTask(node, context, tasks), + "Failed to invoke GenerateTask, libName = %s, node = %s", lib_name.c_str(), + op_desc->GetName().c_str()); + GELOGD("Done invoking GenerateTask successfully"); + return SUCCESS; +} +} // namespace ge \ No newline at end of file diff --git a/src/ge/opskernel_manager/ops_kernel_builder_manager.h b/src/ge/opskernel_manager/ops_kernel_builder_manager.h new file mode 100644 index 00000000..a2e9f9e8 --- /dev/null +++ b/src/ge/opskernel_manager/ops_kernel_builder_manager.h @@ -0,0 +1,56 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_OPSKERNEL_MANAGER_OPS_KERNEL_BUILDER_MANAGER_H_ +#define GE_OPSKERNEL_MANAGER_OPS_KERNEL_BUILDER_MANAGER_H_ + +#include "common/ge/plugin_manager.h" +#include "common/opskernel/ops_kernel_builder.h" +#include "external/ge/ge_api_error_codes.h" + +namespace ge { +using OpsKernelBuilderPtr = std::shared_ptr; +class OpsKernelBuilderManager { + public: + ~OpsKernelBuilderManager(); + + static OpsKernelBuilderManager &Instance(); + + // opsKernelManager initialize, load all opsKernelInfoStore and graph_optimizer + Status Initialize(const std::map &options, bool is_train = true); + + // opsKernelManager finalize, unload all opsKernelInfoStore and graph_optimizer + Status Finalize(); + + // get opsKernelIBuilder by name + OpsKernelBuilderPtr GetOpsKernelBuilder(const std::string &name) const; + + // get all opsKernelBuilders + const std::map &GetAllOpsKernelBuilders() const; + + Status CalcOpRunningParam(Node &node) const; + + Status GenerateTask(const Node &node, RunContext &context, std::vector &tasks) const; + + private: + OpsKernelBuilderManager() = default; + static Status GetLibPaths(const std::map &options, std::string &lib_paths); + + std::unique_ptr plugin_manager_; + std::map ops_kernel_builders_{}; +}; +} // namespace ge +#endif // GE_OPSKERNEL_MANAGER_OPS_KERNEL_BUILDER_MANAGER_H_ diff --git a/src/ge/opskernel_manager/ops_kernel_manager.cc b/src/ge/opskernel_manager/ops_kernel_manager.cc index 11eb3061..346991c9 100644 --- a/src/ge/opskernel_manager/ops_kernel_manager.cc +++ b/src/ge/opskernel_manager/ops_kernel_manager.cc @@ -137,8 +137,8 @@ void OpsKernelManager::GetExternalEnginePath(std::string &extern_engine_path, co std::string so_path = "plugin/opskernel/"; std::string path = path_base + so_path; extern_engine_path = (path + "libfe.so" + ":") + (path + "libge_local_engine.so" + ":") + - (path + "librts_engine.so" + ":") + (path + "libaicpu_engine.so" + ":") + - (path + "libhost_cpu_engine.so" + ":"); + (path + "librts_engine.so" + 
":") + (path + "libaicpu_ascend_engine.so" + ":") + + (path + "libhost_cpu_engine.so" + ":") + (path + "libaicpu_tf_engine.so" + ":"); auto iter = options.find(OPTION_EXEC_HCCL_FLAG); if (iter == options.end() || iter->second != "0") { extern_engine_path += (path_base + "libhcom_graph_adaptor.so"); diff --git a/src/ge/opskernel_manager/optimizer_priority.pbtxt b/src/ge/opskernel_manager/optimizer_priority.pbtxt index 9f8a03fb..1c9522c9 100755 --- a/src/ge/opskernel_manager/optimizer_priority.pbtxt +++ b/src/ge/opskernel_manager/optimizer_priority.pbtxt @@ -1 +1 @@ -optimizer:["aicpu_original_optimizer","AIcoreEngine","VectorEngine","aicpu_optimizer","hccl_graph_optimizer", "hvd_graph_optimizer", "DNN_VM_RTS_GRAPH_OPTIMIZER_STORE"] \ No newline at end of file +optimizer:["aicpu_tf_optimizer","AIcoreEngine","VectorEngine","aicpu_ascend_optimizer","hccl_graph_optimizer", "hvd_graph_optimizer", "DNN_VM_RTS_GRAPH_OPTIMIZER_STORE"] diff --git a/src/ge/plugin/engine/dnnengines.cc b/src/ge/plugin/engine/dnnengines.cc index d85d1668..531e7d99 100644 --- a/src/ge/plugin/engine/dnnengines.cc +++ b/src/ge/plugin/engine/dnnengines.cc @@ -55,7 +55,7 @@ void VectorCoreDNNEngine::GetAttributes(DNNEngineAttribute &attrs) const { attrs AICpuDNNEngine::AICpuDNNEngine(const std::string &engine_name) { engine_attribute_.engine_name = engine_name; - engine_attribute_.compute_cost = COST_2; + engine_attribute_.compute_cost = COST_3; engine_attribute_.runtime_type = DEVICE; engine_attribute_.engine_input_format = FORMAT_RESERVED; engine_attribute_.engine_output_format = FORMAT_RESERVED; @@ -69,6 +69,22 @@ Status AICpuDNNEngine::Finalize() { return SUCCESS; } void AICpuDNNEngine::GetAttributes(DNNEngineAttribute &attrs) const { attrs = engine_attribute_; } +AICpuTFDNNEngine::AICpuTFDNNEngine(const std::string &engine_name) { + engine_attribute_.engine_name = engine_name; + engine_attribute_.compute_cost = COST_2; + engine_attribute_.runtime_type = DEVICE; + 
engine_attribute_.engine_input_format = FORMAT_RESERVED; + engine_attribute_.engine_output_format = FORMAT_RESERVED; +} + +AICpuTFDNNEngine::AICpuTFDNNEngine(const DNNEngineAttribute &attrs) { engine_attribute_ = attrs; } + +Status AICpuTFDNNEngine::Initialize(const std::map &options) { return SUCCESS; } + +Status AICpuTFDNNEngine::Finalize() { return SUCCESS; } + +void AICpuTFDNNEngine::GetAttributes(DNNEngineAttribute &attrs) const { attrs = engine_attribute_; } + GeLocalDNNEngine::GeLocalDNNEngine(const std::string &engine_name) { engine_attribute_.engine_name = engine_name; engine_attribute_.engine_input_format = FORMAT_RESERVED; diff --git a/src/ge/plugin/engine/dnnengines.h b/src/ge/plugin/engine/dnnengines.h index d776c2b9..3011a0fb 100644 --- a/src/ge/plugin/engine/dnnengines.h +++ b/src/ge/plugin/engine/dnnengines.h @@ -70,6 +70,21 @@ class AICpuDNNEngine : public DNNEngine { DNNEngineAttribute engine_attribute_; }; +class AICpuTFDNNEngine : public DNNEngine { + public: + AICpuTFDNNEngine() = default; + explicit AICpuTFDNNEngine(const std::string &engine_name); + explicit AICpuTFDNNEngine(const DNNEngineAttribute &attrs); + ~AICpuTFDNNEngine() = default; + + Status Initialize(const std::map &options); + Status Finalize(); + void GetAttributes(DNNEngineAttribute &attr) const; + + private: + DNNEngineAttribute engine_attribute_; +}; + class GeLocalDNNEngine : public DNNEngine { public: GeLocalDNNEngine() = default; diff --git a/src/ge/plugin/engine/engine_manage.cc b/src/ge/plugin/engine/engine_manage.cc index 82cd90ee..a14c92ea 100644 --- a/src/ge/plugin/engine/engine_manage.cc +++ b/src/ge/plugin/engine/engine_manage.cc @@ -89,10 +89,10 @@ void RegisterVectorEngine() { } void RegisterAiCpuEngine() { - const std::string vm_aicpu = "DNN_VM_AICPU"; + const std::string vm_aicpu = "DNN_VM_AICPU_ASCEND"; std::vector mem_type_aicpu; mem_type_aicpu.emplace_back(GE_ENGINE_ATTR_MEM_TYPE_HBM); - DNNEngineAttribute attr_aicpu = {vm_aicpu, mem_type_aicpu, COST_2, 
DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; + DNNEngineAttribute attr_aicpu = {vm_aicpu, mem_type_aicpu, COST_3, DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; DNNEnginePtr vm_engine_ptr = MakeShared(attr_aicpu); if (vm_engine_ptr == nullptr) { GELOGE(ge::FAILED, "make vm_engine_ptr failed"); @@ -103,6 +103,21 @@ void RegisterAiCpuEngine() { } } +void RegisterAiCpuTFEngine() { + const std::string vm_aicpu_tf = "DNN_VM_AICPU"; + std::vector mem_type_aicpu_tf; + mem_type_aicpu_tf.emplace_back(GE_ENGINE_ATTR_MEM_TYPE_HBM); + DNNEngineAttribute attr_aicpu_tf = {vm_aicpu_tf, mem_type_aicpu_tf, COST_2, DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; + DNNEnginePtr vm_engine_ptr = MakeShared(attr_aicpu_tf); + if (vm_engine_ptr == nullptr) { + GELOGE(ge::FAILED, "make vm_engine_ptr failed"); + return; + } + if (EngineManager::RegisterEngine(vm_aicpu_tf, vm_engine_ptr) != SUCCESS) { + GELOGW("register vmAicpuTFEngine failed"); + } +} + void RegisterGeLocalEngine() { const std::string vm_ge_local = "DNN_VM_GE_LOCAL"; std::vector mem_type_ge_local; @@ -168,6 +183,7 @@ void RegisterHcclEngine() { void GetDNNEngineObjs(std::map &engines) { RegisterAiCoreEngine(); RegisterVectorEngine(); + RegisterAiCpuTFEngine(); RegisterAiCpuEngine(); RegisterGeLocalEngine(); RegisterHostCpuEngine(); diff --git a/src/ge/session/inner_session.cc b/src/ge/session/inner_session.cc index 44c29460..cf7f3047 100644 --- a/src/ge/session/inner_session.cc +++ b/src/ge/session/inner_session.cc @@ -15,9 +15,12 @@ */ #include "session/inner_session.h" + #include #include #include + +#include "analyzer/analyzer.h" #include "adx_datadump_server.h" #include "common/dump/dump_properties.h" #include "common/util.h" @@ -25,6 +28,7 @@ #include "graph/ge_context.h" #include "graph/ge_global_options.h" #include "graph/ge_local_context.h" +#include "graph/common/local_context.h" #include "graph/load/new_model_manager/model_manager.h" #include "graph/manager/graph_var_manager.h" #include "graph/utils/tensor_adapter.h" @@ 
-53,7 +57,7 @@ Status CheckReuseMemoryOption(const std::map &options) { static std::mutex mutex_; // BuildGraph and RunGraph use bool InnerSession::is_dump_server_inited_ = false; InnerSession::InnerSession(uint64_t session_id, const std::map &options) - : init_flag_(false), session_id_(session_id), options_(options), graph_manager_(domi::GetContext()) {} + : init_flag_(false), session_id_(session_id), options_(options) {} Status InnerSession::Initialize() { if (init_flag_) { @@ -125,6 +129,8 @@ Status InnerSession::Finalize() { // release var memory GELOGI("VarManager free var memory."); (void)VarManager::Instance(session_id_)->FreeVarMemory(); + // release analyzer saved info(Session Level) + Analyzer::GetInstance()->DestroySessionJsonObject(session_id_); GE_CHK_RT(rtDeviceReset(static_cast(GetContext().DeviceId()))); GE_CHK_STATUS_RET(RemoveDumpProperties(), "Remove dump properties failed"); @@ -150,7 +156,7 @@ Status InnerSession::AddGraph(uint32_t graph_id, const Graph &graph, return GE_SESS_INIT_FAILED; } UpdateThreadContext(options); - Status ret = graph_manager_.AddGraph(graph_id, graph, options); + Status ret = graph_manager_.AddGraph(graph_id, graph, options, domi::GetContext()); if (ret != SUCCESS) { GELOGE(ret, "[InnerSession:%lu] add graph %u failed.", session_id_, graph_id); return ret; @@ -273,6 +279,7 @@ void InnerSession::UpdateThreadContext(const std::map GetThreadLocalContext().SetSessionOption(options_); GetThreadLocalContext().SetGraphOption(options); GetContext().SetSessionId(session_id_); + SetRtSocVersion(); } void InnerSession::UpdateThreadContext(uint32_t graph_id) { @@ -326,4 +333,17 @@ Status InnerSession::RemoveDumpProperties() { } return SUCCESS; } + +void InnerSession::SetRtSocVersion() { + const auto &global_options = GetMutableGlobalOptions(); + auto it = global_options.find(ge::SOC_VERSION); + if (it != global_options.end()) { + const char *soc_version = it->second.c_str(); + rtError_t rt_ret = rtSetSocVersion(soc_version); + if 
(rt_ret != RT_ERROR_NONE) { + GELOGW("Set soc version %s failed. ret:0x%X", soc_version, rt_ret); + } + GELOGI("Set soc version %s success.", soc_version); + } +} } // namespace ge diff --git a/src/ge/session/inner_session.h b/src/ge/session/inner_session.h index 94d1ac12..9bb12b64 100644 --- a/src/ge/session/inner_session.h +++ b/src/ge/session/inner_session.h @@ -67,6 +67,8 @@ class InnerSession { Status RemoveDumpProperties(); + void SetRtSocVersion(); + private: bool init_flag_; uint64_t session_id_; diff --git a/src/ge/session/omg.cc b/src/ge/session/omg.cc index 0fb342e1..2d9867e7 100644 --- a/src/ge/session/omg.cc +++ b/src/ge/session/omg.cc @@ -19,7 +19,6 @@ #include #include #include "common/auth/file_saver.h" -#include "common/convert/pb2json.h" #include "common/debug/log.h" #include "common/debug/memory_dumper.h" #include "common/ge/ge_util.h" @@ -45,6 +44,7 @@ #include "omg/parser/parser_factory.h" #include "omg/parser/weights_parser.h" #include "parser/common/pre_checker.h" +#include "parser/common/convert/pb2json.h" #include "proto/ge_ir.pb.h" #include "register/op_registry.h" @@ -257,6 +257,11 @@ void FindParserSo(const string &path, vector &file_list, string &caffe_p if (real_path.empty()) { // plugin path does not exist return; } + struct stat stat_buf; + if ((stat(real_path.c_str(), &stat_buf) != 0) || (!S_ISDIR(stat_buf.st_mode))) { + GELOGI("The path %s is not a directory.", real_path.c_str()); + return; + } struct dirent *dent(nullptr); DIR *dir = opendir(real_path.c_str()); @@ -272,21 +277,11 @@ void FindParserSo(const string &path, vector &file_list, string &caffe_p string full_name = real_path + "/" + name; const string so_suff = ".so"; const string caffe_parser_so_suff = "lib_caffe_parser.so"; - const string aicpu_so_suff = "_aicpu.so"; - const string aicpu_host_so_suff = "_online.so"; if (name.size() >= so_suff.size() && name.compare(name.size() - so_suff.size(), so_suff.size(), so_suff) == 0) { if (full_name.size() >= 
caffe_parser_so_suff.size() && full_name.compare(full_name.size() - caffe_parser_so_suff.size(), caffe_parser_so_suff.size(), caffe_parser_so_suff) == 0) { caffe_parser_path = full_name; - } else if ((full_name.size() >= aicpu_so_suff.size() && - full_name.compare(full_name.size() - aicpu_so_suff.size(), aicpu_so_suff.size(), aicpu_so_suff) == - 0) || - (full_name.size() >= aicpu_host_so_suff.size() && - full_name.compare(full_name.size() - aicpu_host_so_suff.size(), aicpu_host_so_suff.size(), - aicpu_host_so_suff) == 0)) { - // aicpu so, Put the file path into the omgcontext and save into the model in the builder stage; - domi::GetContext().aicpu_op_run_paths.push_back(full_name); } else { // save parser so path into file_list vector file_list.push_back(full_name); } @@ -299,29 +294,6 @@ void FindParserSo(const string &path, vector &file_list, string &caffe_p return; } -Status CheckCustomAiCpuOpLib() { - std::vector vec_op_type; - domi::OpRegistry::Instance()->GetOpTypeByImplyType(vec_op_type, domi::ImplyType::CUSTOM); - for (uint32_t i = 0; i < vec_op_type.size(); i++) { - bool aicpu_so_exist = false; - std::string ai_cpu_so_name = "lib" + vec_op_type[i] + "_aicpu.so"; - for (uint32_t j = 0; j < domi::GetContext().aicpu_op_run_paths.size(); j++) { - string bin_file_path = domi::GetContext().aicpu_op_run_paths[j]; - if (bin_file_path.size() >= ai_cpu_so_name.size() && - bin_file_path.compare(bin_file_path.size() - ai_cpu_so_name.size(), ai_cpu_so_name.size(), ai_cpu_so_name) == - 0) { - aicpu_so_exist = true; - break; - } - } - if (!aicpu_so_exist) { - GELOGE(domi::FAILED, "cant find aicpu run so(%s), please check the plugin path!", ai_cpu_so_name.c_str()); - return domi::FAILED; - } - } - return domi::SUCCESS; -} - Status SetOutFormatAndDataTypeAttr(ge::OpDescPtr op_desc, const ge::Format format, const ge::DataType data_type) { if (op_desc == nullptr) { GELOGE(domi::FAILED, "Input op desc invalid."); @@ -455,6 +427,32 @@ Status CheckOutNode(ge::OpDescPtr op_desc, 
int32_t index) { } return domi::SUCCESS; } +Status GetDefaultOutInfo(ge::ComputeGraphPtr &compute_graph, + std::vector> &output_nodes_info) { + std::vector> default_out_nodes = domi::GetContext().default_out_nodes; + if (domi::GetContext().type == domi::CAFFE && !default_out_nodes.empty()) { + for (uint32_t i = 0; i < default_out_nodes.size(); ++i) { + ge::NodePtr out_node = compute_graph->FindNode(default_out_nodes[i].first); + if (out_node == nullptr) { + ErrorManager::GetInstance().ATCReportErrMessage("E10016", {"parameter", "opname"}, + {"out_nodes", default_out_nodes[i].first}); + GELOGE(domi::FAILED, "Can not find src node (%s) in graph.", default_out_nodes[i].first.c_str()); + return domi::FAILED; + } + output_nodes_info.push_back(std::make_pair(out_node, default_out_nodes[i].second)); + GELOGD("Get default output node:%s.", out_node->GetName().c_str()); + } + return domi::SUCCESS; + } + + for (ge::NodePtr node : compute_graph->GetDirectNode()) { + if (!node->GetInAllNodes().empty() && node->GetOutAllNodes().empty()) { + Status ret = GetOutputLeaf(node, output_nodes_info); + GE_CHK_BOOL_RET_STATUS(ret == SUCCESS, ret, "find leaf fail."); + } + } + return domi::SUCCESS; +} Status SetOutputNodeInfo(ge::Graph &graph, const std::string &output_type, const std::string &output) { ge::ComputeGraphPtr compute_graph = ge::GraphUtils::GetComputeGraph(graph); @@ -505,11 +503,9 @@ Status SetOutputNodeInfo(ge::Graph &graph, const std::string &output_type, const } // default output node (leaf) if (user_out_nodes.empty()) { - for (ge::NodePtr node : compute_graph->GetDirectNode()) { - if (!node->GetInAllNodes().empty() && node->GetOutAllNodes().empty()) { - Status ret = GetOutputLeaf(node, output_nodes_info); - GE_CHK_BOOL_RET_STATUS(ret == SUCCESS, ret, "find leaf fail."); - } + if (GetDefaultOutInfo(compute_graph, output_nodes_info) != SUCCESS) { + GELOGE(domi::FAILED, "Get default output info failed."); + return domi::FAILED; } } 
GetOutputNodesNameAndIndex(output_nodes_info, output_nodes_name); @@ -553,6 +549,7 @@ Status GetOutputLeaf(NodePtr node, std::vector> if (node->GetType() != NETOUTPUT) { for (size_t index = 0; index < size; ++index) { output_nodes_info.push_back(std::make_pair(node, index)); + GELOGD("Get output leaf node:%s.", node->GetName().c_str()); } } else { const auto in_anchors = node->GetAllInDataAnchors(); @@ -882,65 +879,66 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertOmModelToJson(const char *model_file, con uint8_t *model_data = nullptr; uint32_t model_len = 0; - - // Parse the contents of the file to get the modeldef object - ret = ModelParserBase::ParseModelContent(model, model_data, model_len); - if (ret == SUCCESS) { - OmFileLoadHelper omFileLoadHelper; - ge::graphStatus status = omFileLoadHelper.Init(model_data, model_len); - if (status != ge::GRAPH_SUCCESS) { - GELOGE(ge::FAILED, "Om file init failed."); - if (model.model_data != nullptr) { - delete[](char *) model.model_data; - model.model_data = nullptr; + try { + // Parse the contents of the file to get the modeldef object + ret = ModelParserBase::ParseModelContent(model, model_data, model_len); + if (ret == SUCCESS) { + OmFileLoadHelper omFileLoadHelper; + ge::graphStatus status = omFileLoadHelper.Init(model_data, model_len); + if (status != ge::GRAPH_SUCCESS) { + GELOGE(ge::FAILED, "Om file init failed."); + if (model.model_data != nullptr) { + delete[](char *) model.model_data; + model.model_data = nullptr; + } + return status; } - return status; - } - ModelPartition ir_part; - status = omFileLoadHelper.GetModelPartition(MODEL_DEF, ir_part); - if (status != ge::GRAPH_SUCCESS) { - GELOGE(ge::FAILED, "Get model part failed."); - if (model.model_data != nullptr) { - delete[](char *) model.model_data; - model.model_data = nullptr; + ModelPartition ir_part; + status = omFileLoadHelper.GetModelPartition(MODEL_DEF, ir_part); + if (status != ge::GRAPH_SUCCESS) { + GELOGE(ge::FAILED, "Get model part failed."); + if 
(model.model_data != nullptr) { + delete[](char *) model.model_data; + model.model_data = nullptr; + } + return status; } - return status; - } - ge::proto::ModelDef model_def; + ge::proto::ModelDef model_def; - // De serialization - bool flag = ReadProtoFromArray(ir_part.data, ir_part.size, &model_def); - if (flag) { - GetGroupName(model_def); + // De serialization + bool flag = ReadProtoFromArray(ir_part.data, ir_part.size, &model_def); + if (flag) { + GetGroupName(model_def); - json j; - Pb2Json::Message2Json(model_def, kOmBlackFields, j, true); + json j; + Pb2Json::Message2Json(model_def, kOmBlackFields, j, true); - ret = ModelSaver::SaveJsonToFile(json_file, j); + ret = ModelSaver::SaveJsonToFile(json_file, j); + } else { + ret = INTERNAL_ERROR; + GELOGE(ret, "ReadProtoFromArray failed."); + } } else { - ret = INTERNAL_ERROR; - GELOGE(ret, "ReadProtoFromArray failed."); + GELOGE(PARAM_INVALID, "ParseModelContent failed because of invalid om file. Please check --om param."); } - } else { - GELOGE(PARAM_INVALID, "ParseModelContent failed because of invalid om file. 
Please check --om param."); - } - if (model.model_data != nullptr) { - delete[](char *) model.model_data; - model.model_data = nullptr; + if (model.model_data != nullptr) { + delete[](char *) model.model_data; + model.model_data = nullptr; + } + return ret; + } catch (const std::exception &e) { + GELOGE(FAILED, "Convert om model to json failed, exception message : %s.", e.what()); + return FAILED; } - - return ret; } FMK_FUNC_HOST_VISIBILITY Status ConvertPbtxtToJson(const char *model_file, const char *json_file) { ge::ModelData model; - // Mode 2 does not need to verify the priority, and a default value of 0 is passed int32_t priority = 0; - // Load model from file Status ret = ModelParserBase::LoadFromFile(model_file, "", priority, model); auto free_model_data = [](void **ptr) -> void { @@ -954,35 +952,36 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertPbtxtToJson(const char *model_file, const GELOGE(ret, "LoadFromFile failed."); return ret; } - bool flag = false; - ge::proto::ModelDef model_def; + try { + bool flag = false; + ge::proto::ModelDef model_def; flag = google::protobuf::TextFormat::ParseFromString(reinterpret_cast(model.model_data), &model_def); + + if (!flag) { + free_model_data(&model.model_data); + GELOGE(FAILED, "ParseFromString fail."); + return FAILED; + } + GetGroupName(model_def); + json j; + Pb2Json::Message2Json(model_def, kOmBlackFields, j, true); + ret = ModelSaver::SaveJsonToFile(json_file, j); + if (ret != SUCCESS) { + free_model_data(&model.model_data); + GELOGE(ret, "Save json to file fail."); + return ret; + } + free_model_data(&model.model_data); + return SUCCESS; } catch (google::protobuf::FatalException &e) { free_model_data(&model.model_data); GELOGE(FAILED, "ParseFromString fail. 
exception message : %s", e.what()); return FAILED; - } - - if (!flag) { - free_model_data(&model.model_data); - GELOGE(FAILED, "ParseFromString fail."); + } catch (const std::exception &e) { + GELOGE(FAILED, "Convert pbtxt to json failed, exception message : %s.", e.what()); return FAILED; } - - GetGroupName(model_def); - json j; - Pb2Json::Message2Json(model_def, kOmBlackFields, j, true); - ret = ModelSaver::SaveJsonToFile(json_file, j); - if (ret != SUCCESS) { - free_model_data(&model.model_data); - GELOGE(ret, "Save json to file fail."); - return ret; - } - - free_model_data(&model.model_data); - - return SUCCESS; } FMK_FUNC_HOST_VISIBILITY Status ConvertFwkModelToJson(const domi::FrameworkType framework, const char *model_file, @@ -1028,13 +1027,33 @@ FMK_FUNC_HOST_VISIBILITY Status DumpInfershapeJson(const ge::Graph &graph, const void UpdateOmgCtxWithParserCtx() { domi::GetContext().format = GetParserContext().format; domi::GetContext().input_dims = GetParserContext().input_dims; - return; + domi::GetContext().user_input_dims = GetParserContext().user_input_dims; + domi::GetContext().is_dynamic_input = GetParserContext().is_dynamic_input; + domi::GetContext().type = GetParserContext().type; + domi::GetContext().user_out_nodes = GetParserContext().user_out_nodes; + domi::GetContext().train_flag = GetParserContext().train_flag; + domi::GetContext().run_mode = GetParserContext().run_mode; + domi::GetContext().op_conf_map = GetParserContext().op_conf_map; + domi::GetContext().out_nodes_map = GetParserContext().out_nodes_map; + domi::GetContext().input_nodes_format_map = GetParserContext().input_nodes_format_map; + domi::GetContext().out_top_names = GetParserContext().out_top_names; + domi::GetContext().user_out_nodes_top_vec = GetParserContext().user_out_nodes_top_vec; + domi::GetContext().default_out_nodes = GetParserContext().default_out_nodes; } void UpdateParserCtxWithOmgCtx() { GetParserContext().format = domi::GetContext().format; 
GetParserContext().input_dims = domi::GetContext().input_dims; + GetParserContext().user_input_dims = domi::GetContext().user_input_dims; + GetParserContext().is_dynamic_input = domi::GetContext().is_dynamic_input; + GetParserContext().type = domi::GetContext().type; + GetParserContext().user_out_nodes = domi::GetContext().user_out_nodes; + GetParserContext().train_flag = domi::GetContext().train_flag; GetParserContext().run_mode = domi::GetContext().run_mode; - return; + GetParserContext().op_conf_map = domi::GetContext().op_conf_map; + GetParserContext().out_nodes_map = domi::GetContext().out_nodes_map; + GetParserContext().input_nodes_format_map = domi::GetContext().input_nodes_format_map; + GetParserContext().out_top_names = domi::GetContext().out_top_names; + GetParserContext().user_out_nodes_top_vec = domi::GetContext().user_out_nodes_top_vec; } } // namespace ge diff --git a/src/ge/single_op/task/aicpu_task_builder.cc b/src/ge/single_op/task/aicpu_task_builder.cc index 9ad52d81..aa26d049 100644 --- a/src/ge/single_op/task/aicpu_task_builder.cc +++ b/src/ge/single_op/task/aicpu_task_builder.cc @@ -111,14 +111,11 @@ Status AiCpuTaskBuilder::InitWorkspaceAndIO(void **io_addr, void **kernel_worksp Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag, uint64_t session_id) { - void *io_addr = nullptr; - void *kernel_workspace = nullptr; - GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&io_addr, &kernel_workspace, param, dynamic_flag)); + GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&task.io_addr_, &task.workspace_addr_, param, dynamic_flag)); STR_FWK_OP_KERNEL fwk_op_kernel = {0}; - auto ret = SetFmkOpKernel(io_addr, kernel_workspace, fwk_op_kernel); + auto ret = SetFmkOpKernel(task.io_addr_, task.workspace_addr_, fwk_op_kernel); if (ret != SUCCESS) { - (void)rtFree(io_addr); return ret; } @@ -149,15 +146,12 @@ Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam return FAILED;) ret = 
SetKernelArgs(&task.args_, fwk_op_kernel); if (ret != SUCCESS) { - (void)rtFree(io_addr); return ret; } task.arg_size_ = sizeof(STR_FWK_OP_KERNEL); task.op_type_ = op_desc_->GetName(); - task.io_addr_ = io_addr; task.task_info_ = kernel_def_.task_info(); - task.workspace_addr_ = kernel_workspace; task.dynamic_flag_ = dynamic_flag; auto debug_info = BuildTaskUtils::GetTaskInfo(op_desc_); diff --git a/src/ge/stub/Makefile b/src/ge/stub/Makefile deleted file mode 100644 index 820fc70d..00000000 --- a/src/ge/stub/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -inc_path := $(shell pwd)/inc/external/ -out_path := $(shell pwd)/out/ge/lib64/stub/ -stub_path := $(shell pwd)/framework/domi/stub/ - -mkdir_stub := $(shell mkdir -p $(out_path)) -local_stub := $(shell $(HI_PYTHON) $(stub_path)/gen_stubapi.py $(inc_path) $(out_path)) diff --git a/src/ge/stub/README b/src/ge/stub/README deleted file mode 100644 index ca98ce85..00000000 --- a/src/ge/stub/README +++ /dev/null @@ -1,4 +0,0 @@ -################################################################################### -the directory (stub) saves the stub file -gen_stubapi.py is using for retrieving API and generating stub functions -################################################################################### diff --git a/src/ge/stub/README.md b/src/ge/stub/README.md deleted file mode 100755 index a085e537..00000000 --- a/src/ge/stub/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# "stub" usage: - -## Description - -- File libge_compiler.so ,libgraph.so are used in IR build application interface. - -# Attention - -- Don't link other library except libge_compiler.so ,libgraph.so, as they may be changed in the future. - -# Usage - -## Compile: compile the application invoking the IR build API. 
- -Makefile: - -''' - -ATC_INCLUDE_DIR := $(ASCEND_PATH)/atc/include -OPP_INCLUDE_DIR := $(ASCEND_PATH)/opp/op_proto/built-in/inc -LOCAL_MODULE_NAME := ir_build -CC := g++ -CFLAGS := -std=c++11 -g -Wall -SRCS := $(wildcard $(LOCAL_DIR)/main.cpp) -INCLUDES := -I $(ASCEND_OPP_PATH)/op_proto/built-in/inc \ - -I $(ATC_INCLUDE_DIR)/graph \ - -I $(ATC_INCLUDE_DIR)/ge \ - -LIBS := -L ${ASCEND_PATH}/atc/lib64/stub \ - -lgraph \ - -lge_compiler -ir_build: - mkdir -p out - $(CC) $(SRCS) $(INCLUDES) $(LIBS) $(CFLAGS) -o ./out/$(LOCAL_MODULE_NAME) -clean: - rm -rf out - -''' -make - -## Run the application after set the LD_LIBRARY_PATH to include the real path of the library which locates in the directory of atc/lib64 - -export LD_LIBRARY_PATH= $(ASCEND_PATH)/atc/lib64 - - ./ ir_build diff --git a/src/ge/stub/gen_stubapi.py b/src/ge/stub/gen_stubapi.py deleted file mode 100644 index b6e1e70c..00000000 --- a/src/ge/stub/gen_stubapi.py +++ /dev/null @@ -1,578 +0,0 @@ -import os -import re -import sys -import logging - -logging.basicConfig(stream=sys.stdout, format='[%(asctime)s] [%(lineno)s] %(levelname)s: %(message)s', - level=logging.INFO) - -""" - this attr is used for symbol table visible -""" -GE_ATTR = 'GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY' - -""" - generate stub func body by return type -""" -RETURN_STATEMENTS = { - 'graphStatus': ' std::cout << "[ERROR]: stub library libgraph or libge_compiler cannot be used for execution, please check your "\n ' - ' << "environment variables and compilation options to make sure you use the correct library."\n' - ' << std::endl;\n' - ' return ACL_ERROR_COMPILING_STUB_MODE;', - 'Status': ' return SUCCESS;', - 'Graph': ' return Graph();', - 'Graph&': ' return *this;', - 'Format': ' return Format();', - 'Format&': ' return *this;', - 'Shape': ' return Shape();', - 'Shape&': ' return *this;', - 'TensorDesc': ' return TensorDesc();', - 'TensorDesc&': ' return *this;', - 'Tensor': ' return Tensor();', - 'Tensor&': ' return *this;', - 
'Operator': ' return Operator();', - 'Operator&': ' return *this;', - 'Ptr': ' return nullptr;', - 'std::string': ' return "";', - 'std::string&': ' return "";', - 'string': ' return "";', - 'int': ' return 0;', - 'DataType': ' return DT_FLOAT;', - 'InferenceContextPtr': ' return nullptr;', - 'SubgraphBuilder': ' return nullptr;', - 'OperatorImplPtr': ' return nullptr;', - 'OutHandler': ' return nullptr;', - 'std::vector': ' return {};', - 'std::vector': ' return {};', - 'std::map': ' return {};', - 'uint32_t': ' return 0;', - 'int64_t': ' return 0;', - 'uint64_t': ' return 0;', - 'size_t': ' return 0;', - 'float': ' return 0.0f;', - 'bool': ' return false;', -} - -""" - max code len per line in hua_wei software programming specifications -""" -max_code_len_per_line = 100 - -""" - white_list_for_debug, include_dir_key_words is to - determines which header files to generate cc files from - when DEBUG on -""" -white_list_for_debug = ["attr_value.h", "operator.h", "tensor.h", "graph.h", "operator_factory.h", - "ge_ir_build.h", "ge_api.h", "tensorflow_parser.h", "caffe_parser.h"] -include_dir_key_words = ["ge", "graph", "parser"] -DEBUG = True - - -def need_generate_func(func_line): - """ - :param func_line: - :return: - """ - if func_line.strip().endswith("default") or func_line.strip().endswith("delete") \ - or func_line.strip().startswith("typedef") or func_line.strip().startswith("using"): - return False - return True - - -def file_endswith_white_list_suffix(file): - """ - :param file: - :return: - """ - if DEBUG: - for suffix in white_list_for_debug: - if file.endswith(suffix): - return True - return False - else: - return True - - -""" - belows are patterns used for analyse .h file -""" -# pattern function -pattern_func = re.compile(r"""(^[\s]*) #leading with space,we will find and delete after -([a-zA-Z~_] # void int likely -.* -[)] #we find ) -(?!.*{) # we do not want the case int abc() const { return 1;} -.*) -(;.*) #we want to find ; and after for we will 
replace these later -\n$ -""", re.VERBOSE | re.MULTILINE | re.DOTALL) - -# pattern comment -pattern_comment = re.compile(r'^\s*//') -pattern_comment_2_start = re.compile(r'^\s*/[*]') -pattern_comment_2_end = re.compile(r'[*]/\s*$') -# pattern define -pattern_define = re.compile(r'^\s*#define') -pattern_define_return = re.compile(r'\\\s*$') -# blank line -pattern_blank_line = re.compile(r'^\s*$') -# virtual,explicit,friend,static -pattern_keyword = re.compile(r'(virtual\s+|explicit\s+|friend\s+|static\s+)') -# lead space -pattern_leading_space = re.compile(r'(^[\s]*)[a-zA-Z~_]') -# functions will have patterns such as func ( or func( -# but operator is an exception; the class name is preceded by an operator, and the above mode does not exist -# format like :"operator = ()" -pattern_func_name = re.compile(r'([a-zA-Z0-9~_\-]+\s*|operator?.*)[(]') -# template -pattern_template = re.compile(r'^\s*template') -pattern_template_end = re.compile(r'>\s*$') -# namespace -pattern_namespace = re.compile(r'namespace.*{') -# class : which can handle classA a and {not on the same line, but if found ';' after class,then don't deal with -pattern_class = re.compile(r'^[\s]*(class|struct)\s+(%s\s+)?([a-zA-Z0-9_\-]+ 0 and not friend_match: - line, func_name = self.handle_class_member_func(line, template_string) - # Normal functions - else: - line, func_name = self.handle_normal_func(line, template_string) - - need_generate = need_generate_func(line) - # func body - line += self.implement_function(line) - # comment - line = self.gen_comment(start_i) + line - # write to out file - self.write_func_content(line, func_name, need_generate) - # next loop - self.line_index += 1 - - logging.info('Added %s functions', len(self.func_list_exist)) - logging.info('Successfully converted,please see ' + self.output_file) - - def handle_func1(self, line): - """ - :param line: - :return: - """ - find1 = re.search('[(]', line) - if not find1: - self.line_index += 1 - return "continue", line, None - find2 
= re.search('[)]', line) - start_i = self.line_index - space_match = pattern_leading_space.search(line) - # deal with - # int abc(int a, - # int b) - if find1 and (not find2): - self.line_index += 1 - line2 = self.input_content[self.line_index] - if space_match: - line2 = re.sub('^' + space_match.group(1), '', line2) - line += line2 - while self.line_index < len(self.input_content) and (not re.search('[)]', line2)): - self.line_index += 1 - line2 = self.input_content[self.line_index] - line2 = re.sub('^' + space_match.group(1), '', line2) - line += line2 - - match_start = pattern_start.search(self.input_content[self.line_index]) - match_end = pattern_end.search(self.input_content[self.line_index]) - if match_start: # like ) { or ) {} int the last line - if not match_end: - self.stack.append('normal_now') - ii = start_i - while ii <= self.line_index: - ii += 1 - self.line_index += 1 - return "continue", line, start_i - logging.info("line[%s]", line) - # ' int abc();'->'int abc()' - (line, match) = pattern_func.subn(r'\2\n', line) - logging.info("line[%s]", line) - # deal with case: - # 'int \n abc(int a, int b)' - if re.search(r'^\s*(inline)?\s*[a-zA-Z0-9_]+\s*$', self.input_content[start_i - 1]): - line = self.input_content[start_i - 1] + line - line = line.lstrip() - if not match: - self.line_index += 1 - return "continue", line, start_i - return "pass", line, start_i - - def handle_stack(self, match_start): - """ - :param match_start: - :return: - """ - line = self.input_content[self.line_index] - match_end = pattern_end.search(line) - if match_start: - self.stack.append('normal_now') - if match_end: - top_status = self.stack.pop() - if top_status == 'namespace_now': - self.output_fd.write(line + '\n') - elif top_status == 'class_now': - self.stack_class.pop() - self.stack_template.pop() - if match_start or match_end: - self.line_index += 1 - return "continue" - - if len(self.stack) > 0 and self.stack[-1] == 'normal_now': - self.line_index += 1 - return 
"continue" - return "pass" - - def handle_class(self, template_string, line, match_start, match_class): - """ - :param template_string: - :param line: - :param match_start: - :param match_class: - :return: - """ - if match_class: # we face a class - self.stack_template.append(template_string) - self.stack.append('class_now') - class_name = match_class.group(3) - - # class template specializations: class A > - if '<' in class_name: - k = line.index('<') - fit = 1 - for ii in range(k + 1, len(line)): - if line[ii] == '<': - fit += 1 - if line[ii] == '>': - fit -= 1 - if fit == 0: - break - class_name += line[k + 1:ii + 1] - logging.info('class_name[%s]', class_name) - self.stack_class.append(class_name) - while not match_start: - self.line_index += 1 - line = self.input_content[self.line_index] - match_start = pattern_start.search(line) - self.line_index += 1 - return "continue" - return "pass" - - def handle_template(self): - line = self.input_content[self.line_index] - match_template = pattern_template.search(line) - template_string = '' - if match_template: - match_template_end = pattern_template_end.search(line) - template_string = line - while not match_template_end: - self.line_index += 1 - line = self.input_content[self.line_index] - template_string += line - match_template_end = pattern_template_end.search(line) - self.line_index += 1 - return template_string - - def handle_namespace(self): - line = self.input_content[self.line_index] - match_namespace = pattern_namespace.search(line) - if match_namespace: # we face namespace - self.output_fd.write(line + '\n') - self.stack.append('namespace_now') - self.line_index += 1 - - def handle_normal_func(self, line, template_string): - template_line = '' - self.stack_template.append(template_string) - if self.stack_template[-1] != '': - template_line = re.sub(r'\s*template', 'template', self.stack_template[-1]) - # change '< class T = a, class U = A(3)>' to '' - template_line = re.sub(r'\s*=.*>(\s*)$', r'>\1', 
template_line) - template_line = re.sub(r'\s*=.*,', ',', template_line) - template_line = re.sub(r'\s*=.*', '', template_line) - line = re.sub(r'\s*=.*,', ',', line) - line = re.sub(r'\s*=.*\)', ')', line) - line = template_line + line - self.stack_template.pop() - func_name = re.search(r'^.*\)', line, re.MULTILINE | re.DOTALL).group() - logging.info("line[%s]", line) - logging.info("func_name[%s]", func_name) - return line, func_name - - def handle_class_member_func(self, line, template_string): - template_line = '' - x = '' - if template_string != '': - template_string = re.sub(r'\s*template', 'template', template_string) - template_string = re.sub(r'\s*=.*>(\s*)$', r'>\1', template_string) - template_string = re.sub(r'\s*=.*,', ',', template_string) - template_string = re.sub(r'\s*=.*', '', template_string) - if self.stack_template[-1] != '': - if not (re.search(r'<\s*>', stack_template[-1])): - template_line = re.sub(r'^\s*template', 'template', stack_template[-1]) - if not (re.search(r'<.*>', self.stack_class[-1])): - # for x we get like template -> - x = re.sub(r'template\s*<', '<', template_line) # remove template -> - x = re.sub(r'\n', '', x) - x = re.sub(r'\s*=.*,', ',', x) - x = re.sub(r'\s*=.*\>', '>', x) - x = x.rstrip() # remove \n - x = re.sub(r'(class|typename)\s+|(|\s*class)', '', - x) # remove class,typename -> - x = re.sub(r'<\s+', '<', x) - x = re.sub(r'\s+>', '>', x) - x = re.sub(r'\s+,', ',', x) - x = re.sub(r',\s+', ', ', x) - line = re.sub(r'\s*=\s+0', '', line) - line = re.sub(r'\s*=\s+.*,', ',', line) - line = re.sub(r'\s*=\s+.*\)', ')', line) - logging.info("x[%s]\nline[%s]", x, line) - # if the function is long, void ABC::foo() - # breaks into two lines void ABC::\n foo() - temp_line = pattern_func_name.sub(self.stack_class[-1] + x + '::' + r'\1(', line, count=1) - if len(temp_line) > max_code_len_per_line: - line = pattern_func_name.sub(self.stack_class[-1] + x + '::\n' + r'\1(', line, count=1) - else: - line = temp_line - 
logging.info("line[%s]", line) - # add template as the above if there is one - template_line = re.sub(r'\s*=.*>(\s*)$', r'>\1', template_line) - template_line = re.sub(r'\s*=.*,', ',', template_line) - template_line = re.sub(r'\s*=.*', '', template_line) - line = template_line + template_string + line - func_name = re.search(r'^.*\)', line, re.MULTILINE | re.DOTALL).group() - logging.info("line[%s]", line) - logging.info("func_name[%s]", func_name) - return line, func_name - - def write_func_content(self, content, func_name, need_generate): - if not (func_name in self.func_list_exist) and need_generate: - self.output_fd.write(content) - self.func_list_exist.append(func_name) - logging.info('add func:[%s]', func_name) - - def gen_comment(self, start_i): - comment_line = '' - # Function comments are on top of function declarations, copy them over - k = start_i - 1 # one line before this func start - if pattern_template.search(self.input_content[k]): - k -= 1 - if pattern_comment_2_end.search(self.input_content[k]): - comment_line = self.input_content[k].lstrip() - while not pattern_comment_2_start.search(self.input_content[k]): - k -= 1 - comment_line = self.input_content[k].lstrip() + comment_line - else: - for j in range(k, 0, -1): - c_line = self.input_content[j] - if pattern_comment.search(c_line): - c_line = re.sub(r'\s*//', '//', c_line) - comment_line = c_line + comment_line - else: - break - return comment_line - - @staticmethod - def implement_function(func): - function_def = '' - function_def += '{\n' - - all_items = func.split() - start = 0 - return_type = all_items[start] - if return_type == "const": - start += 1 - return_type = all_items[start] - if return_type.startswith(('std::map', 'std::set', 'std::vector')): - return_type = "std::map" - if return_type.endswith('*') or (len(all_items) > start + 1 and all_items[start + 1].startswith('*')): - return_type = "Ptr" - if len(all_items) > start + 1 and all_items[start + 1].startswith('&'): - return_type += 
"&" - if RETURN_STATEMENTS.__contains__(return_type): - function_def += RETURN_STATEMENTS[return_type] - else: - logging.warning("Unhandled return type[%s]", return_type) - - function_def += '\n' - function_def += '}\n' - function_def += '\n' - return function_def - - -def collect_header_files(path): - """ - :param path: - :return: - """ - header_files = [] - shared_includes_content = [] - for root, dirs, files in os.walk(path): - files.sort() - for file in files: - if file.find("git") >= 0: - continue - if not file.endswith('.h'): - continue - file_path = os.path.join(root, file) - file_path = file_path.replace('\\', '/') - header_files.append(file_path) - include_str = '#include "{}"\n'.format(file_path[path.rindex('/') + 1:]) - shared_includes_content.append(include_str) - # for acl error code - shared_includes_content.append('#include \n') - shared_includes_content.append('const int ACL_ERROR_COMPILING_STUB_MODE = 100039;\n') - return header_files, shared_includes_content - - -def generate_stub_file(inc_dir, out_cc_dir): - """ - :param inc_dir: - :param out_cc_dir: - :return: - """ - target_header_files, shared_includes_content = collect_header_files(inc_dir) - for header_file in target_header_files: - if not file_endswith_white_list_suffix(header_file): - continue - cc_file = re.sub('.h*$', '.cc', header_file) - h_2_cc = H2CC(header_file, out_cc_dir + cc_file[cc_file.rindex('/') + 1:], shared_includes_content) - h_2_cc.h2cc() - - -def gen_code(inc_dir, out_cc_dir): - """ - :param inc_dir: - :param out_cc_dir: - :return: - """ - if not inc_dir.endswith('/'): - inc_dir += '/' - if not out_cc_dir.endswith('/'): - out_cc_dir += '/' - for include_dir_key_word in include_dir_key_words: - generate_stub_file(inc_dir + include_dir_key_word, out_cc_dir) - - -if __name__ == '__main__': - inc_dir = sys.argv[1] - out_cc_dir = sys.argv[2] - gen_code(inc_dir, out_cc_dir) diff --git a/third_party/fwkacllib/inc/hccl/base.h b/third_party/fwkacllib/inc/hccl/base.h index 
00c220f1..f543fa4d 100644 --- a/third_party/fwkacllib/inc/hccl/base.h +++ b/third_party/fwkacllib/inc/hccl/base.h @@ -61,6 +61,16 @@ struct model_feature { float *gradient_time; /**< The BP compution time of each gradient */ }; +/** + * @brief Memory Register Address Struct for Remote Access + */ +struct MemRegisterAddr { + u64 addr; + u64 length; +}; + +const u32 HCCL_MAX_MEM_REGISTER_NUM = 1024 * 1024; // The max number of memory register address is 1M (1024 * 1024). + enum GradSplitForceMode { FORCE_NONE, /**< no force */ FORCE_SIZE, /**< force split gradient by size */ diff --git a/third_party/fwkacllib/inc/hccl/hccl_types.h b/third_party/fwkacllib/inc/hccl/hccl_types.h deleted file mode 100644 index 276516e7..00000000 --- a/third_party/fwkacllib/inc/hccl/hccl_types.h +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file hccl_types.h - * @brief HCCL data type definition - * - */ - -#ifndef HCCL_TYPES_H_ -#define HCCL_TYPES_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/** - * @brief HCCL functions return value definition - */ -typedef enum { - HCCL_SUCCESS = 0, /**< success */ - HCCL_E_PARA = 1, /**< parameter error */ - HCCL_E_PTR = 2, /**< empty pointer */ - HCCL_E_MEMORY = 3, /**< memory error */ - HCCL_E_INTERNAL = 4, /**< internal error */ - HCCL_E_NOT_SUPPORT = 5, /**< not support feature */ - HCCL_E_NOT_FOUND = 6, /**< not found specific resource */ - HCCL_E_UNAVAIL = 7, /**< resource unavailable */ - HCCL_E_SYSCALL = 8, /**< call system interface error */ - HCCL_E_TIMEOUT = 9, /**< timeout */ - HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */ - HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */ - HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */ - HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */ - HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */ - HCCL_E_RUNTIME = 15, /**< call runtime api fail */ - HCCL_E_DRV = 16, /**< call driver api fail */ - HCCL_E_PROFILING = 17, /**< call profiling api fail */ - HCCL_E_CCE = 18, /**< call cce api fail */ - HCCL_E_NETWORK = 19, /**< call network api fail */ - HCCL_E_RESERVED /**< reserved */ -} HcclResult; - -/** - * @brief handle to HCCL communicator - */ -typedef void *HcclComm; - -/** - * @brief HCCL Reduction opperation - */ -typedef enum { - HCCL_REDUCE_SUM = 0, /**< sum */ - HCCL_REDUCE_PROD = 1, /**< prod */ - HCCL_REDUCE_MAX = 2, /**< max */ - HCCL_REDUCE_MIN = 3, /**< min */ - HCCL_REDUCE_RESERVED /**< reserved */ -} HcclReduceOp; - -/** - * @brief HCCL data type - */ -typedef enum { - HCCL_DATA_TYPE_INT8 = 0, /**< int8 */ - HCCL_DATA_TYPE_INT16 = 1, /**< int16 */ - HCCL_DATA_TYPE_INT32 = 2, /**< int32 */ - HCCL_DATA_TYPE_FP16 = 3, /**< fp16 */ - HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */ - HCCL_DATA_TYPE_RESERVED /**< reserved */ -} HcclDataType; - -const uint32_t 
HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length - -/** - * @brief HCCL root info - */ -typedef struct HcclRootInfoDef { - char internal[HCCL_ROOT_INFO_BYTES]; -} HcclRootInfo; - -#ifdef __cplusplus -} -#endif // __cplusplus -#endif // HCCL_TYPES_H_ diff --git a/third_party/fwkacllib/inc/hccl/hcom.h b/third_party/fwkacllib/inc/hccl/hcom.h index 4399d3a8..90b96ac7 100644 --- a/third_party/fwkacllib/inc/hccl/hcom.h +++ b/third_party/fwkacllib/inc/hccl/hcom.h @@ -270,6 +270,15 @@ extern HcclResult hcom_set_split_strategy_by_index(const char *group, u32 segmen */ extern HcclResult hcom_set_split_strategy_by_size(const char *group, u32 segmentNum, const float *sizeList); +/** + * @brief Register memories and init resources for remote access. + * + * @param addrList memory addresses for remote access. + * @param count number of remote memory addresses. + * @return HcclResult + */ +extern HcclResult hcom_remote_access_mem_register(const MemRegisterAddr* addrList, u32 count); + #ifdef __cplusplus } #endif // __cplusplus diff --git a/third_party/fwkacllib/inc/mmpa/mmpa_api.h b/third_party/fwkacllib/inc/mmpa/mmpa_api.h index ce1c9720..a7f13636 100644 --- a/third_party/fwkacllib/inc/mmpa/mmpa_api.h +++ b/third_party/fwkacllib/inc/mmpa/mmpa_api.h @@ -17,10 +17,10 @@ #ifndef _MMPA_API_H_ #define _MMPA_API_H_ -#define LINUX 0 -#define WIN 1 +#define LINUX 0 +#define WIN 1 -#if(OS_TYPE == LINUX) //lint !e553 +#if(OS_TYPE == LINUX) #ifndef _GNU_SOURCE #define _GNU_SOURCE @@ -75,6 +75,7 @@ #include #include #include +#include #include "securec.h" @@ -84,7 +85,7 @@ #endif -#if(OS_TYPE == WIN) //lint !e553 +#if(OS_TYPE == WIN) #include #include #include "Windows.h" @@ -103,16 +104,19 @@ #include #include "shlwapi.h" #include -#include "sub_inc/mmpa_typedef_win.h" -#include "sub_inc/mmpa_win.h" #include #include #include #include - +#include #include #include +#include "securec.h" + +#include "sub_inc/mmpa_typedef_win.h" +#include "sub_inc/mmpa_win.h" + #pragma comment(lib, 
"ws2_32.lib") #pragma comment(lib, "mswsock.lib") #pragma comment(lib, "Kernel32.lib") diff --git a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h index 6ac8f8f6..aced4968 100644 --- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h +++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h @@ -30,18 +30,26 @@ typedef pthread_t mmThread; typedef pthread_mutex_t mmMutex_t; typedef pthread_cond_t mmCond; typedef pthread_mutex_t mmMutexFC; +typedef pthread_rwlock_t mmRWLock_t; typedef signed int mmProcess; typedef int mmPollHandle; typedef int mmPipeHandle; +typedef int mmFileHandle; typedef int mmComPletionKey; typedef int mmCompletionHandle; +typedef int mmErrorMsg; +typedef int mmFd_t; typedef VOID *mmExitCode; typedef key_t mmKey_t; typedef int mmMsgid; typedef struct dirent mmDirent; +typedef struct shmid_ds mmshmId_ds; typedef int (*mmFilter)(const mmDirent *entry); typedef int (*mmSort)(const mmDirent **a, const mmDirent **b); +typedef size_t mmSize_t; +typedef off_t mmOfft_t; +typedef pid_t mmPid_t; typedef VOID *(*userProcFunc)(VOID *pulArg); @@ -50,6 +58,16 @@ typedef struct { VOID *pulArg; // Callback function parameters } mmUserBlock_t; +typedef struct { + const char *dli_fname; + void *dli_fbase; + const char *dli_sname; + void *dli_saddr; + size_t dli_size; /* ELF only */ + int dli_bind; /* ELF only */ + int dli_type; +} mmDlInfo; + typedef struct { int wSecond; // Seconds. [0-60] (1 leap second) int wMinute; // Minutes. 
[0-59] @@ -73,6 +91,7 @@ typedef pthread_key_t mmThreadKey; typedef int mmOverLap; typedef ssize_t mmSsize_t; +typedef size_t mmSize; // size typedef struct { UINT32 createFlag; @@ -201,6 +220,17 @@ typedef struct { #define M_RDWR O_RDWR #define M_CREAT O_CREAT #define M_BINARY O_RDONLY +#define M_TRUNC O_TRUNC +#define M_IRWXU S_IRWXU + +#define M_IN_CREATE IN_CREATE +#define M_IN_CLOSE_WRITE IN_CLOSE_WRITE +#define M_IN_IGNORED IN_IGNORED + +#define M_OUT_CREATE IN_CREATE +#define M_OUT_CLOSE_WRITE IN_CLOSE_WRITE +#define M_OUT_IGNORED IN_IGNORED +#define M_OUT_ISDIR IN_ISDIR #define M_IREAD S_IREAD #define M_IRUSR S_IRUSR @@ -236,13 +266,20 @@ typedef struct { #define MMPA_OPTIONAL_ARGUMENT 2 #define MMPA_MAX_PATH PATH_MAX +#define M_NAME_MAX MAX_FNAME #define M_F_OK F_OK #define M_R_OK R_OK #define M_W_OK W_OK +#define MMPA_STDIN STDIN_FILENO +#define MMPA_STDOUT STDOUT_FILENO +#define MMPA_STDERR STDERR_FILENO + #define MMPA_RTLD_NOW RTLD_NOW #define MMPA_RTLD_GLOBAL RTLD_GLOBAL +#define MMPA_RTLD_LAZY RTLD_LAZY +#define MMPA_RTLD_NODELETE RTLD_NODELETE #define MMPA_DL_EXT_NAME ".so" @@ -250,6 +287,7 @@ extern INT32 mmCreateTask(mmThread *threadHandle, mmUserBlock_t *funcBlock); extern INT32 mmJoinTask(mmThread *threadHandle); extern INT32 mmMutexInit(mmMutex_t *mutex); extern INT32 mmMutexLock(mmMutex_t *mutex); +extern INT32 mmMutexTryLock(mmMutex_t *mutex); extern INT32 mmMutexUnLock(mmMutex_t *mutex); extern INT32 mmMutexDestroy(mmMutex_t *mutex); extern INT32 mmCondInit(mmCond *cond); @@ -257,6 +295,14 @@ extern INT32 mmCondLockInit(mmMutexFC *mutex); extern INT32 mmCondLock(mmMutexFC *mutex); extern INT32 mmCondUnLock(mmMutexFC *mutex); extern INT32 mmCondLockDestroy(mmMutexFC *mutex); +extern INT32 mmRWLockInit(mmRWLock_t *rwLock); +extern INT32 mmRWLockRDLock(mmRWLock_t *rwLock); +extern INT32 mmRWLockTryRDLock(mmRWLock_t *rwLock); +extern INT32 mmRWLockWRLock(mmRWLock_t *rwLock); +extern INT32 mmRWLockTryWRLock(mmRWLock_t *rwLock); +extern INT32 
mmRDLockUnLock(mmRWLock_t *rwLock); +extern INT32 mmWRLockUnLock(mmRWLock_t *rwLock); +extern INT32 mmRWLockDestroy(mmRWLock_t *rwLock); extern INT32 mmCondWait(mmCond *cond, mmMutexFC *mutex); extern INT32 mmCondTimedWait(mmCond *cond, mmMutexFC *mutex, UINT32 milliSecond); extern INT32 mmCondNotify(mmCond *cond); @@ -266,6 +312,7 @@ extern INT32 mmGetPid(); extern INT32 mmGetTid(); extern INT32 mmGetPidHandle(mmProcess *processHandle); extern INT32 mmGetLocalTime(mmSystemTime_t *sysTime); +extern INT32 mmGetSystemTime(mmSystemTime_t *sysTime); extern INT32 mmSemInit(mmSem_t *sem, UINT32 value); extern INT32 mmSemWait(mmSem_t *sem); @@ -273,7 +320,9 @@ extern INT32 mmSemPost(mmSem_t *sem); extern INT32 mmSemDestroy(mmSem_t *sem); extern INT32 mmOpen(const CHAR *pathName, INT32 flags); extern INT32 mmOpen2(const CHAR *pathName, INT32 flags, MODE mode); +extern FILE *mmPopen(CHAR *command, CHAR *type); extern INT32 mmClose(INT32 fd); +extern INT32 mmPclose(FILE *stream); extern mmSsize_t mmWrite(INT32 fd, VOID *buf, UINT32 bufLen); extern mmSsize_t mmRead(INT32 fd, VOID *buf, UINT32 bufLen); extern mmSockHandle mmSocket(INT32 sockFamily, INT32 type, INT32 protocol); @@ -284,9 +333,22 @@ extern INT32 mmConnect(mmSockHandle sockFd, mmSockAddr *addr, mmSocklen_t addrLe extern INT32 mmCloseSocket(mmSockHandle sockFd); extern mmSsize_t mmSocketSend(mmSockHandle sockFd, VOID *sendBuf, INT32 sendLen, INT32 sendFlag); extern mmSsize_t mmSocketRecv(mmSockHandle sockFd, VOID *recvBuf, INT32 recvLen, INT32 recvFlag); +extern INT32 mmSocketSendTo(mmSockHandle sockFd, + VOID *sendMsg, + INT32 sendLen, + UINT32 sendFlag, + const mmSockAddr* addr, + INT32 tolen); +extern mmSsize_t mmSocketRecvFrom(mmSockHandle sockFd, + VOID *recvBuf, + mmSize recvLen, + UINT32 recvFlag, + mmSockAddr* addr, + mmSocklen_t *FromLen); extern INT32 mmSAStartup(); extern INT32 mmSACleanup(); extern VOID *mmDlopen(const CHAR *fileName, INT32 mode); +extern INT32 mmDladdr(VOID *addr, mmDlInfo *info); 
extern VOID *mmDlsym(VOID *handle, CHAR *funcName); extern INT32 mmDlclose(VOID *handle); extern CHAR *mmDlerror(); @@ -294,6 +356,7 @@ extern INT32 mmCreateAndSetTimer(mmTimer *timerHandle, mmUserBlock_t *timerBlock extern INT32 mmDeleteTimer(mmTimer timerHandle); extern INT32 mmStatGet(const CHAR *path, mmStat_t *buffer); extern INT32 mmStat64Get(const CHAR *path, mmStat64_t *buffer); +extern INT32 mmFStatGet(INT32 fd, mmStat_t *buffer); extern INT32 mmMkdir(const CHAR *pathName, mmMode_t mode); extern INT32 mmSleep(UINT32 milliSecond); @@ -337,6 +400,7 @@ extern VOID mmCloseCompletionPort(mmCompletionHandle handle); extern INT32 mmPoll(mmPollfd *fds, INT32 fdCount, INT32 timeout, mmCompletionHandle handleIOCP, pmmPollData polledData, mmPollBack pollBack); extern INT32 mmGetErrorCode(); +extern CHAR *mmGetErrorFormatMessage(mmErrorMsg errnum, CHAR *buf, mmSize size); extern INT32 mmGetTimeOfDay(mmTimeval *timeVal, mmTimezone *timeZone); extern mmTimespec mmGetTickCount(); extern INT32 mmGetRealPath(CHAR *path, CHAR *realPath); @@ -382,6 +446,7 @@ extern INT32 mmTlsDelete(mmThreadKey key); extern INT32 mmGetOsType(); extern INT32 mmFsync(mmProcess fd); +extern INT32 mmFsync2(INT32 fd); extern INT32 mmChdir(const CHAR *path); extern INT32 mmUmask(INT32 pmode); extern INT32 mmThreadKill(mmThread id); @@ -439,6 +504,10 @@ extern INT32 mmCreateProcess(const CHAR *fileName, const mmArgvEnv *env, const c extern INT32 mmCreateTaskWithThreadAttr(mmThread *threadHandle, const mmUserBlock_t *funcBlock, const mmThreadAttr *threadAttr); +extern mmFileHandle mmShmOpen(const CHAR *name, INT32 oflag, mmMode_t mode); +extern INT32 mmShmUnlink(const CHAR *name); +extern VOID *mmMmap(mmFd_t fd, mmSize_t size, mmOfft_t offset, mmFd_t *extra, INT32 prot, INT32 flags); +extern INT32 mmMunMap(VOID *data, mmSize_t size, mmFd_t *extra); #define MMPA_DLL_API #ifdef __cplusplus diff --git a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_linux.h 
b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_linux.h index fc862a72..9df5b9ce 100644 --- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_linux.h +++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_linux.h @@ -79,6 +79,8 @@ typedef long LONG; #define MMPA_THREAD_SCHED_OTHER SCHED_OTHER #define MMPA_THREAD_MIN_STACK_SIZE PTHREAD_STACK_MIN +#define MM_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER + #define MMPA_MAX_NI 19 #define MMPA_MIN_NI (-20) @@ -86,6 +88,7 @@ typedef long LONG; #define EN_ERR 1 #define EN_ERROR (-1) #define EN_INVALID_PARAM (-2) +#define EN_TIMEOUT (-3) #ifdef __cplusplus #if __cplusplus diff --git a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_win.h b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_win.h index fc1b4858..1627d7a9 100644 --- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_win.h +++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_win.h @@ -35,6 +35,7 @@ extern "C" { #define EN_ERR 1 #define EN_ERROR (-1) #define EN_INVALID_PARAM (-2) +#define EN_TIMEOUT (-3) #define HANDLE_INVALID_VALUE (-1) #define INVALID_SOCKET_HANDLE INVALID_SOCKET @@ -60,6 +61,7 @@ extern "C" { #define MMPA_MIDDLE_NI 5 #define MMPA_LOW_NI (-5) #define MMPA_MIN_NI (-20) +#define MMPA_MAX_FILE 128 #define MMPA_MAX_THREAD_PIO 99 #define MMPA_MIDDLE_THREAD_PIO 66 @@ -71,6 +73,8 @@ extern "C" { #define MMPA_THREAD_SCHED_OTHER 0 #define MMPA_THREAD_MIN_STACK_SIZE 0 +#define MM_MUTEX_INITIALIZER NULL + #ifdef __cplusplus #if __cplusplus } diff --git a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h index 68a70c27..be8e2bf3 100644 --- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h +++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h @@ -43,8 +43,9 @@ typedef HANDLE mmThread; typedef HANDLE mmProcess; typedef HANDLE mmPollHandle; typedef HANDLE mmPipeHandle; +typedef HANDLE mmFileHandle; typedef HANDLE mmCompletionHandle; - +typedef HANDLE mmFd_t; typedef 
CRITICAL_SECTION mmMutexFC; typedef CONDITION_VARIABLE mmCond; @@ -59,15 +60,22 @@ typedef SYSTEMTIME mmSystemTime_t; typedef HANDLE mmSem_t; typedef SOCKET mmSockHandle; +typedef SRWLOCK mmRWLock_t; typedef struct sockaddr mmSockAddr; typedef int mmSocklen_t; typedef int mmSemTimeout_t; typedef long mmAtomicType; typedef DWORD mmExitCode; +typedef DWORD mmErrorMsg; typedef int mmKey_t; typedef HANDLE mmMsgid; +typedef long int mmOfft_t; +typedef int mmPid_t; typedef INT32 mmSsize_t; +typedef int mmSize; // size +typedef size_t mmSize_t; +typedef VOID mmshmId_ds; typedef enum { DT_DIR = FILE_ATTRIBUTE_DIRECTORY, @@ -181,6 +189,16 @@ typedef struct { ULONGLONG availSize; } mmDiskSize; +typedef struct { + const char *dli_fname; + void *dli_fbase; + const char *dli_sname; + void *dli_saddr; + size_t dli_size; /* ELF only */ + int dli_bind; /* ELF only */ + int dli_type; +} mmDlInfo; + typedef struct { char addr[MMPA_MACINFO_DEFAULT_SIZE]; // ex:aa-bb-cc-dd-ee-ff\0 } mmMacInfo; @@ -223,8 +241,10 @@ typedef VOID (*mmPf)(VOID); #define M_RDONLY _O_RDONLY #define M_WRONLY _O_WRONLY #define M_RDWR _O_RDWR +#define M_IRWXU _O_RDWR #define M_CREAT _O_CREAT #define M_BINARY _O_BINARY +#define M_TRUNC _O_TRUNC #define M_IREAD _S_IREAD #define M_IRUSR _S_IREAD @@ -232,6 +252,15 @@ typedef VOID (*mmPf)(VOID); #define M_IWUSR _S_IWRITE #define M_IXUSR 0 +#define M_IN_CREATE FILE_NOTIFY_CHANGE_FILE_NAME | FILE_NOTIFY_CHANGE_DIR_NAME +#define M_IN_CLOSE_WRITE FILE_NOTIFY_CHANGE_LAST_WRITE +#define M_IN_IGNORED FILE_NOTIFY_CHANGE_FILE_NAME | FILE_NOTIFY_CHANGE_DIR_NAME + +#define M_OUT_CREATE 0x00000100 +#define M_OUT_CLOSE_WRITE 0x00000008 +#define M_OUT_IGNORED 0x00008000 +#define M_OUT_ISDIR 0x40000000 + #define M_MSG_CREAT 1 #define M_MSG_EXCL 2 #define M_MSG_NOWAIT 3 @@ -251,6 +280,16 @@ typedef VOID (*mmPf)(VOID); #define M_UMASK_GRPEXEC 0 #define M_UMASK_OTHEXEC 0 +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 
+#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 + #define mmConstructor(x) __declspec(allocate(".CRT$XCU")) mmPf con = x #define mmDestructor(x) __declspec(allocate(".CRT$XPU")) mmPf de = x @@ -269,13 +308,20 @@ typedef VOID (*mmPf)(VOID); #define MMPA_EMSG "" #define MMPA_MAX_PATH MAX_PATH +#define M_NAME_MAX _MAX_FNAME #define M_F_OK 0 #define M_W_OK 2 #define M_R_OK 4 +#define MMPA_STDIN stdin +#define MMPA_STDOUT stdout +#define MMPA_STDERR stderr + #define MMPA_RTLD_NOW 0 #define MMPA_RTLD_GLOBAL 0 +#define MMPA_RTLD_LAZY 0 +#define MMPA_RTLD_NODELETE 0 #define MMPA_DL_EXT_NAME ".dll" @@ -285,6 +331,7 @@ _declspec(dllexport) INT32 mmCreateTask(mmThread *threadHandle, mmUserBlock_t *f _declspec(dllexport) INT32 mmJoinTask(mmThread *threadHandle); _declspec(dllexport) INT32 mmMutexInit(mmMutex_t *mutex); _declspec(dllexport) INT32 mmMutexLock(mmMutex_t *mutex); +_declspec(dllexport) INT32 mmMutexTryLock(mmMutex_t *mutex); _declspec(dllexport) INT32 mmMutexUnLock(mmMutex_t *mutex); _declspec(dllexport) INT32 mmMutexDestroy(mmMutex_t *mutex); _declspec(dllexport) INT32 mmCondInit(mmCond *cond); @@ -292,6 +339,14 @@ _declspec(dllexport) INT32 mmCondLockInit(mmMutexFC *mutex); _declspec(dllexport) INT32 mmCondLock(mmMutexFC *mutex); _declspec(dllexport) INT32 mmCondUnLock(mmMutexFC *mutex); _declspec(dllexport) INT32 mmCondLockDestroy(mmMutexFC *mutex); +_declspec(dllexport) INT32 mmRWLockInit(mmRWLock_t *rwLock); +_declspec(dllexport) INT32 mmRWLockRDLock(mmRWLock_t *rwLock); +_declspec(dllexport) INT32 mmRWLockTryRDLock(mmRWLock_t *rwLock); +_declspec(dllexport) INT32 mmRWLockWRLock(mmRWLock_t *rwLock); +_declspec(dllexport) INT32 mmRWLockTryWRLock(mmRWLock_t *rwLock); +_declspec(dllexport) INT32 mmRDLockUnLock(mmRWLock_t *rwLock); +_declspec(dllexport) INT32 mmWRLockUnLock(mmRWLock_t *rwLock); +_declspec(dllexport) INT32 mmRWLockDestroy(mmRWLock_t *rwLock); _declspec(dllexport) INT32 mmCondWait(mmCond *cond, mmMutexFC *mutex); _declspec(dllexport) INT32 
mmCondTimedWait(mmCond *cond, mmMutexFC *mutex, UINT32 milliSecond); @@ -302,13 +357,16 @@ _declspec(dllexport) INT32 mmGetPid(VOID); _declspec(dllexport) INT32 mmGetTid(VOID); _declspec(dllexport) INT32 mmGetPidHandle(mmProcess *processHandle); _declspec(dllexport) INT32 mmGetLocalTime(mmSystemTime_t *sysTime); +_declspec(dllexport) INT32 mmGetSystemTime(mmSystemTime_t *sysTime); _declspec(dllexport) INT32 mmSemInit(mmSem_t *sem, UINT32 value); _declspec(dllexport) INT32 mmSemWait(mmSem_t *sem); _declspec(dllexport) INT32 mmSemPost(mmSem_t *sem); _declspec(dllexport) INT32 mmSemDestroy(mmSem_t *sem); _declspec(dllexport) INT32 mmOpen(const CHAR *pathName, INT32 flags); _declspec(dllexport) INT32 mmOpen2(const CHAR *pathName, INT32 flags, MODE mode); +_declspec(dllexport) FILE *mmPopen(CHAR *command, CHAR *type); _declspec(dllexport) INT32 mmClose(INT32 fd); +_declspec(dllexport) INT32 mmPclose(FILE *stream); _declspec(dllexport) mmSsize_t mmWrite(INT32 fd, VOID *buf, UINT32 bufLen); _declspec(dllexport) mmSsize_t mmRead(INT32 fd, VOID *buf, UINT32 bufLen); _declspec(dllexport) mmSockHandle mmSocket(INT32 sockFamily, INT32 type, INT32 protocol); @@ -319,9 +377,22 @@ _declspec(dllexport) INT32 mmConnect(mmSockHandle sockFd, mmSockAddr *addr, mmSo _declspec(dllexport) INT32 mmCloseSocket(mmSockHandle sockFd); _declspec(dllexport) mmSsize_t mmSocketRecv(mmSockHandle sockFd, VOID *recvBuf, INT32 recvLen, INT32 recvFlag); _declspec(dllexport) mmSsize_t mmSocketSend(mmSockHandle sockFd, VOID *sendBuf, INT32 sendLen, INT32 sendFlag); +_declspec(dllexport) INT32 mmSocketSendTo(mmSockHandle sockFd, + VOID *sendMsg, + INT32 sendLen, + UINT32 sendFlag, + const mmSockAddr* addr, + INT32 tolen); +_declspec(dllexport) mmSsize_t mmSocketRecvFrom(mmSockHandle sockFd, + VOID *recvBuf, + mmSize recvLen, + UINT32 recvFlag, + mmSockAddr* addr, + mmSocklen_t *FromLen); _declspec(dllexport) INT32 mmSAStartup(VOID); _declspec(dllexport) INT32 mmSACleanup(VOID); _declspec(dllexport) VOID 
*mmDlopen(const CHAR *fileName, INT mode); +_declspec(dllexport) INT32 mmDladdr(VOID *addr, mmDlInfo *info); _declspec(dllexport) VOID *mmDlsym(VOID *handle, CHAR *fileName); _declspec(dllexport) INT32 mmDlclose(VOID *handle); _declspec(dllexport) CHAR *mmDlerror(VOID); @@ -330,6 +401,7 @@ _declspec(dllexport) INT32 _declspec(dllexport) INT32 mmDeleteTimer(mmTimer timerHandle); _declspec(dllexport) INT32 mmStatGet(const CHAR *path, mmStat_t *buffer); _declspec(dllexport) INT32 mmStat64Get(const CHAR *path, mmStat64_t *buffer); +_declspec(dllexport) INT32 mmFStatGet(INT32 fd, mmStat_t *buffer); _declspec(dllexport) INT32 mmMkdir(const CHAR *pathName, mmMode_t mode); _declspec(dllexport) INT32 mmSleep(UINT32 milliSecond); _declspec(dllexport) INT32 mmCreateTaskWithAttr(mmThread *threadHandle, mmUserBlock_t *funcBlock); @@ -371,6 +443,7 @@ _declspec(dllexport) INT32 mmPoll(mmPollfd *fds, INT32 fdCount, INT32 timeout, m pmmPollData polledData, mmPollBack pollBack); _declspec(dllexport) INT32 mmGetErrorCode(); +_declspec(dllexport) CHAR *mmGetErrorFormatMessage(mmErrorMsg errnum, CHAR *buf, mmSize size); _declspec(dllexport) INT32 mmGetTimeOfDay(mmTimeval *timeVal, mmTimezone *timeZone); _declspec(dllexport) mmTimespec mmGetTickCount(); _declspec(dllexport) INT32 mmGetRealPath(CHAR *path, CHAR *realPath); @@ -407,7 +480,7 @@ _declspec(dllexport) INT32 mmTlsDelete(mmThreadKey key); _declspec(dllexport) INT32 mmGetOsType(); _declspec(dllexport) INT32 mmFsync(mmProcess fd); - +_declspec(dllexport) INT32 mmFsync2(INT32 fd); _declspec(dllexport) INT32 mmChdir(const CHAR *path); _declspec(dllexport) INT32 mmUmask(INT32 pmode); _declspec(dllexport) INT32 mmWaitPid(mmProcess pid, INT32 *status, INT32 options); @@ -455,7 +528,10 @@ _declspec(dllexport) INT32 _declspec(dllexport) INT32 mmCreateTaskWithThreadAttr(mmThread *threadHandle, const mmUserBlock_t *funcBlock, const mmThreadAttr *threadAttr); - +_declspec(dllexport) mmFileHandle mmShmOpen(const CHAR *name, INT32 oflag, 
mmMode_t mode); +_declspec(dllexport) INT32 mmShmUnlink(const CHAR *name); +_declspec(dllexport) VOID *mmMmap(mmFd_t fd, mmSize_t size, mmOfft_t offset, mmFd_t *extra, INT32 prot, INT32 flags); +_declspec(dllexport) INT32 mmMunMap(VOID *data, mmSize_t size, mmFd_t *extra); #ifdef __cplusplus #if __cplusplus } diff --git a/third_party/fwkacllib/inc/ops/aipp.h b/third_party/fwkacllib/inc/ops/aipp.h index dd01ac5f..bed984bd 100644 --- a/third_party/fwkacllib/inc/ops/aipp.h +++ b/third_party/fwkacllib/inc/ops/aipp.h @@ -18,28 +18,23 @@ * \file aipp.h * \brief */ -#ifndef GE_OP_AIPP_H -#define GE_OP_AIPP_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_AIPP_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_AIPP_H_ #include "graph/operator_reg.h" namespace ge { /** -*@brief Performs AI pre-processing (AIPP) on images including color space -conversion (CSC), -image normalization (by subtracting the mean value or multiplying a factor), -image cropping -(by specifying the crop start and cropping the image to the size required by -the neural network), and much more. \n +*@brief Performs AI pre-processing (AIPP) on images including color space conversion (CSC), +image normalization (by subtracting the mean value or multiplying a factor), image cropping +(by specifying the crop start and cropping the image to the size required by the neural network), and much more. \n *@par Inputs: -*@li images: An NCHW or NHWC tensor of type uint8, specifying the input to the -data layer. +*@li images: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer. *@li params: Dynamic AIPP configuration parameters of type uint8. \n *@par Attributes: -*aipp_config_path: A required string, specifying the path of the AIPP -configuration file. \n +*aipp_config_path: A required string, specifying the path of the AIPP configuration file. \n *@par Outputs: *features: The AIPP-processed output tensor of type float16 or uint8. 
@@ -78,4 +73,4 @@ REG_OP(AippData) .OP_END_FACTORY_REG(AippData) } // namespace ge -#endif // GE_OP_AIPP_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_AIPP_H_ diff --git a/third_party/fwkacllib/inc/ops/all_ops.h b/third_party/fwkacllib/inc/ops/all_ops.h index 84ff3d08..1ac83783 100644 --- a/third_party/fwkacllib/inc/ops/all_ops.h +++ b/third_party/fwkacllib/inc/ops/all_ops.h @@ -18,8 +18,8 @@ * \file all_ops.h * \brief */ -#ifndef BUILT_IN_OP_PROTO_INC_ALL_OPS_H_ -#define BUILT_IN_OP_PROTO_INC_ALL_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_ALL_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_ALL_OPS_H_ #include "aipp.h" #include "array_ops.h" @@ -76,4 +76,4 @@ #include "transformation_ops.h" #include "condtake_ops.h" #include "warp_perspective_ops.h" -#endif // BUILT_IN_OP_PROTO_INC_ALL_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_ALL_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/array_ops.h b/third_party/fwkacllib/inc/ops/array_ops.h index 1af02b05..e1f64421 100644 --- a/third_party/fwkacllib/inc/ops/array_ops.h +++ b/third_party/fwkacllib/inc/ops/array_ops.h @@ -18,8 +18,8 @@ * \file array_ops.h * \brief */ -#ifndef GE_OP_ARRAY_OPS_H_ -#define GE_OP_ARRAY_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_ARRAY_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_ARRAY_OPS_H_ #include "graph/operator_reg.h" #include "graph/operator.h" @@ -659,8 +659,7 @@ REG_OP(IdentityN) .OP_END_FACTORY_REG(IdentityN) /** -*@brief Inserts a dimension of 1 into a tensor's shape. Only the tensor shape is changed, without - changing the data. \n +*@brief Inserts a dimension of 1 into a tensor's shape. Only the tensor shape is changed, without changing the data. \n *@par Inputs: *@li x: A tensor. @@ -738,8 +737,7 @@ REG_OP(Reshape) *x: A tensor. \n *@par Attributes: -*axis: An optional list of int32 or int64. If not specified, squeezes all dimensions of size 1. -If specified, only squeezes the dimensions listed. It is an error to squeeze a dimension that is not 1. \n +*axis: An optional list of int32 or int64. 
If not specified, squeezes all dimensions of size 1. If specified, only squeezes the dimensions listed. It is an error to squeeze a dimension that is not 1. \n *@par Outputs: *y: A tensor. \n @@ -754,8 +752,7 @@ REG_OP(Squeeze) .OP_END_FACTORY_REG(Squeeze) /** -*@brief Returns an integer representing the rank of input tensor. The rank of a tensor is the number of -indices required to uniquely select each element of the tensor, that is, the dimension size of the tensor. \n +*@brief Returns an integer representing the rank of input tensor. The rank of a tensor is the number of indices required to uniquely select each element of the tensor, that is, the dimension size of the tensor. \n *@par Inputs: *x: A tensor. \n @@ -889,14 +886,29 @@ REG_OP(ReadVariableOp) .ATTR(dtype, Int, DT_INT32) .OP_END_FACTORY_REG(ReadVariableOp) +/** +*@brief Mark outputs of one sub graph which partitioned by engine type. + +*@par Inputs: +*x: A tensor. \n + +*@par Outputs: +*y: A tensor. \n + +*@par Attributes: +*@li peerIndex: The index of the corresponding 'placeholder' node it's connected to. +*@li parentOpType: Op type of original node. + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ REG_OP(End) .INPUT(x, TensorType::ALL()) .OUTPUT(y, TensorType::ALL()) - .ATTR(peerIndex, Int, 0) // the index of the corresponding 'placeholder' node it's connected to - .ATTR(parentOpType, String, "") // op type of original node + .ATTR(peerIndex, Int, 0) + .ATTR(parentOpType, String, "") .OP_END_FACTORY_REG(End) - /** *@brief Operations for writing summary data, for use in analysis and visualization. @@ -964,8 +976,7 @@ REG_OP(ShapeN) *@par Attributes: *@li dtype: Optional. The data type of the output tensor. Defaults to "int32". -*@li init: An optional bool. If true, initializes the returned tensor with the default value of "dtype". -Defaults to "false". \n +*@li init: An optional bool. If true, initializes the returned tensor with the default value of "dtype". 
Defaults to "false". \n *@par Outputs: *y: A tensor. \n @@ -1144,4 +1155,4 @@ REG_OP(EditDistance) } // namespace ge -#endif // GE_OP_ARRAY_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_ARRAY_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/audio_ops.h b/third_party/fwkacllib/inc/ops/audio_ops.h index 149c57d5..d9883253 100644 --- a/third_party/fwkacllib/inc/ops/audio_ops.h +++ b/third_party/fwkacllib/inc/ops/audio_ops.h @@ -18,8 +18,8 @@ * \file audio_ops.h * \brief */ -#ifndef GE_OP_AUDIO_OPS_H_ -#define GE_OP_AUDIO_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_AUDIO_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_AUDIO_OPS_H_ #include "graph/operator_reg.h" @@ -159,4 +159,4 @@ REG_OP(EncodeWav) .OP_END_FACTORY_REG(EncodeWav) } // namespace ge -#endif // GE_OP_AUDIO_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_AUDIO_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/batch_ops.h b/third_party/fwkacllib/inc/ops/batch_ops.h index 0e1562c0..8a1c5a7b 100644 --- a/third_party/fwkacllib/inc/ops/batch_ops.h +++ b/third_party/fwkacllib/inc/ops/batch_ops.h @@ -18,8 +18,8 @@ * \file batch_ops.h * \brief */ -#ifndef GE_OP_BATCH_OPS_H_ -#define GE_OP_BATCH_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_BATCH_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_BATCH_OPS_H_ #include "graph/operator_reg.h" @@ -158,4 +158,4 @@ REG_OP(UnbatchGrad) .OP_END_FACTORY_REG(UnbatchGrad) } // namespace ge -#endif // GE_OP_BATCH_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_BATCH_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/bitwise_ops.h b/third_party/fwkacllib/inc/ops/bitwise_ops.h index 5b35a38a..5c83e161 100644 --- a/third_party/fwkacllib/inc/ops/bitwise_ops.h +++ b/third_party/fwkacllib/inc/ops/bitwise_ops.h @@ -18,8 +18,8 @@ * \file bitwise_ops.h * \brief */ -#ifndef GE_OP_BITWISE_OPS_H_ -#define GE_OP_BITWISE_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_BITWISE_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_BITWISE_OPS_H_ #include "graph/operator_reg.h" @@ -56,4 +56,4 @@ REG_OP(RightShift) } // namespace ge -#endif 
// GE_OP_BITWISE_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_BITWISE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/boosted_trees_ops.h b/third_party/fwkacllib/inc/ops/boosted_trees_ops.h index f1b4e7a9..550e8b7d 100644 --- a/third_party/fwkacllib/inc/ops/boosted_trees_ops.h +++ b/third_party/fwkacllib/inc/ops/boosted_trees_ops.h @@ -18,8 +18,8 @@ * \file boosted_trees_ops.h * \brief */ -#ifndef GE_OP_BOOSTED_TREES_OPS_H_ -#define GE_OP_BOOSTED_TREES_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_BOOSTED_TREES_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_BOOSTED_TREES_OPS_H_ #include "graph/operator_reg.h" @@ -61,4 +61,4 @@ REG_OP(BoostedTreesBucketize) } // namespace ge -#endif // GE_OP_BOOSTED_TREES_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_BOOSTED_TREES_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h b/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h index 9b9ce314..e20607bf 100644 --- a/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h +++ b/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h @@ -18,8 +18,8 @@ * \file candidate_sampling_ops.h * \brief */ -#ifndef GE_OP_CANDIDATE_SAMPLING_OPS_H_ -#define GE_OP_CANDIDATE_SAMPLING_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_CANDIDATE_SAMPLING_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_CANDIDATE_SAMPLING_OPS_H_ #include "graph/operator_reg.h" @@ -412,4 +412,4 @@ REG_OP(ComputeAccidentalHits) } // namespace ge -#endif // GE_OP_CANDIDATE_SAMPLING_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_CANDIDATE_SAMPLING_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/condtake_ops.h b/third_party/fwkacllib/inc/ops/condtake_ops.h index 554c18f1..5e91eb07 100644 --- a/third_party/fwkacllib/inc/ops/condtake_ops.h +++ b/third_party/fwkacllib/inc/ops/condtake_ops.h @@ -18,8 +18,8 @@ * \file condtake_ops.h * \brief */ -#ifndef GE_OP_CONDTAKE_OPS_H_ -#define GE_OP_CONDTAKE_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_CONDTAKE_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_CONDTAKE_OPS_H_ #include 
"graph/operator_reg.h" #include "graph/operator.h" @@ -56,4 +56,4 @@ REG_OP(CondTake) .OP_END_FACTORY_REG(CondTake) } // namespace ge -#endif // GE_OP_ARRAY_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_CONDTAKE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/control_flow_ops.h b/third_party/fwkacllib/inc/ops/control_flow_ops.h index e2fd4715..7196b14f 100644 --- a/third_party/fwkacllib/inc/ops/control_flow_ops.h +++ b/third_party/fwkacllib/inc/ops/control_flow_ops.h @@ -18,8 +18,8 @@ * \file control_flow_ops.h * \brief */ -#ifndef GE_CONTROL_FLOW_OPS_H_ -#define GE_CONTROL_FLOW_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_CONTROL_FLOW_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_CONTROL_FLOW_OPS_H_ #include "graph/operator_reg.h" #include "graph/operator.h" @@ -404,4 +404,4 @@ REG_OP(MapIndex) .OP_END_FACTORY_REG(MapIndex) } // namespace ge -#endif // GE_CONTROL_FLOW_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_CONTROL_FLOW_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/ctc_ops.h b/third_party/fwkacllib/inc/ops/ctc_ops.h index 383568dc..2c75fd09 100644 --- a/third_party/fwkacllib/inc/ops/ctc_ops.h +++ b/third_party/fwkacllib/inc/ops/ctc_ops.h @@ -18,8 +18,8 @@ * \file ctc_ops.h * \brief */ -#ifndef GE_OP_CTC_OPS_H -#define GE_OP_CTC_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_CTC_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_CTC_OPS_H_ #include "graph/operator.h" #include "graph/operator_reg.h" @@ -139,4 +139,4 @@ REG_OP(CTCBeamSearchDecoder) } // namespace ge -#endif //GE_OP_CTC_OPS_H \ No newline at end of file +#endif // OPS_BUILT_IN_OP_PROTO_INC_CTC_OPS_H_ \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/data_flow_ops.h b/third_party/fwkacllib/inc/ops/data_flow_ops.h index 3bfcfe01..461b3617 100644 --- a/third_party/fwkacllib/inc/ops/data_flow_ops.h +++ b/third_party/fwkacllib/inc/ops/data_flow_ops.h @@ -18,8 +18,8 @@ * \file data_flow_ops.h * \brief */ -#ifndef GE_OP_DATA_FLOW_OPS_H_ -#define GE_OP_DATA_FLOW_OPS_H_ +#ifndef 
OPS_BUILT_IN_OP_PROTO_INC_DATA_FLOW_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_DATA_FLOW_OPS_H_ #include #include "graph/operator_reg.h" @@ -2242,4 +2242,4 @@ REG_OP(OutfeedEnqueueOp) } // namespace ge -#endif // GE_OP_DATA_FLOW_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_DATA_FLOW_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h index 6d865399..536dea63 100644 --- a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h @@ -18,8 +18,8 @@ * \file elewise_calculation_ops.h * \brief */ -#ifndef GE_OP_ELEWISE_CALCULATION_OPS_H -#define GE_OP_ELEWISE_CALCULATION_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_ELEWISE_CALCULATION_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_ELEWISE_CALCULATION_OPS_H_ #include "graph/operator_reg.h" namespace ge { @@ -28,10 +28,9 @@ namespace ge { *@par Inputs: *Dynamic inputs, including: -* @li x: A list of Tensor objects, each with same shape and type. The supported -types are: +* @li x: A list of Tensor objects, each with same shape and type. The supported types are: * float16, float32, double, int32, uint8, int16, int8, complex64, int64, -* qint8, quint8, qint32, uint16, complex128, uint32, uint64. \n +* qint8, quint8, qint32, uint16, complex128, uint32, uint64. It's a dynamic input. \n *@par Outputs: *y: A Tensor. Has the same shape and type as the elements of "x". \n @@ -122,8 +121,7 @@ REG_OP(MinimumGrad) *@par Inputs: *One input: -*x:A Tensor. Must be one of the following types: bool, float16, float, int8, -int32, uint32, uint8, +*x:A Tensor. Must be one of the following types: bool, float16, float, int8, int32, uint32, uint8, int64, uint64, int16, uint16, double, complex64, complex128, qint8, quint8, qint16, quint16, qint32. \n *@par Attributes: @@ -387,8 +385,7 @@ REG_OP(Sign) *@par Inputs: *Two inputs, including: \n -*@li x1: A Tensor. 
Must be one of the following types: float16, float32, - float64, int32, int64, complex64,complex128 +*@li x1: A Tensor. Must be one of the following types: float16, float32, float64, int32, int64, complex64,complex128 *@li x2: A Tensor. Has the same type as "x1". \n *@par Outputs: @@ -487,16 +484,12 @@ REG_OP(Equal) *@par Inputs: *One input:\n -*x: A Tensor. Must be one of the following types: float16, float32, double, -complex64, complex128. \n +*x: A Tensor. Must be one of the following types: float16, float32, double, complex64, complex128. \n *@par Attributes: -*@li base: An optional attribute of type float32, specifying the base gamma. -Defaults to "-1.0". -*@li scale: An optional attribute of type float32, specifying the scale alpha. -Defaults to "1.0". -*@li shift: An optional attribute of type float32, specifying the shift beta. -Defaults to "0.0". \n +*@li base: An optional attribute of type float32, specifying the base gamma. Defaults to "-1.0". +*@li scale: An optional attribute of type float32, specifying the scale alpha. Defaults to "1.0". +*@li shift: An optional attribute of type float32, specifying the shift beta. Defaults to "0.0". \n *@par Outputs: *y: A Tensor of the same type as "x". \n @@ -517,8 +510,7 @@ REG_OP(Exp) *@par Inputs: *One input: -*x: A Tensor. Must be one of the following types: float16, float32, double, -complex64, complex128. \n +*x: A Tensor. Must be one of the following types: float16, float32, double, complex64, complex128. \n *@par Outputs: *y: A Tensor of the same type as "x". \n @@ -535,9 +527,7 @@ REG_OP(Expm1) *@brief: Computes the reciprocal of "x". \n *@par Inputs:\n -*x: A Tensor. Must be one of the following types: float16, float32, -int32, int64, double, -complex64, complex128. \n +*x: A Tensor. Must be one of the following types: float16, float32, int32, int64, double, complex64, complex128. \n *@par Outputs: *y: A Tensor. Has the same type as "x". 
\n @@ -759,8 +749,7 @@ REG_OP(Xlogy) *@par Inputs: *One input: \n -*x: A Tensor. Must be one of the following types: float16, float32, float64, -int32, int64, complex64, complex128 +*x: A Tensor. Must be one of the following types: float16, float32, float64, int32, int64, complex64, complex128 *@par Outputs: *y: A Tensor. Has the same type as "x". \n @@ -801,8 +790,7 @@ REG_OP(Rsqrt) * *@par Inputs: -* x: A tensor. Must be one of the following types: float16, float32, float64, -int32, int64, complex64, complex128. +* x: A tensor. Must be one of the following types: float16, float32, float64, int32, int64, complex64, complex128. * *@par Outputs: * y: A tensor. Has the same type as "x". @@ -823,8 +811,7 @@ REG_OP(Asin) * *@par Inputs: -*@li y: A tensor of type float16, float32, float64, -int32, int64, complex64, complex128. +*@li y: A tensor of type float16, float32, float64, int32, int64, complex64, complex128. *@li dy: A tensor of the same type as "y". * *@attention Constraints: @@ -851,8 +838,7 @@ REG_OP(AsinGrad) * *@par Inputs: -* x: A tensor. Must be one of the following types: float16, float32, float64, -int32, int64, complex64, complex128. +* x: A tensor. Must be one of the following types: float16, float32, float64, int32, int64, complex64, complex128. * *@par Outputs: * y: A tensor. Has the same type as "x". @@ -897,8 +883,7 @@ REG_OP(AcosGrad) * *@par Inputs: -* x: A tensor. Must be one of the following types: float16, float32, float64, - complex64, complex128. +* x: A tensor. Must be one of the following types: float16, float32, float64, complex64, complex128. * *@attention Constraints: * x Given an input tensor, the function computes inverse hyperbolic cosine of every element.\n @@ -1175,8 +1160,7 @@ REG_OP(FusedMulAdd) * *@par Inputs: -*@li x1: A tensor. Must be one of the following types: float16, float32, float64, -uint8, int8, int16, int32, int64, complex64, complex128. +*@li x1: A tensor. 
Must be one of the following types: float16, float32, float64, uint8, int8, int16, int32, int64, complex64, complex128. *@li x2: A tensor of the same type as "x1". * *@attention Constraints: @@ -1205,8 +1189,7 @@ REG_OP(AddV2) *@brief Updates "ref" by adding "value" to it. \n *@par Inputs: -*@li ref: A Tensor. Must be one of the following types: float16, float32, int8, -int16, int32, int64, uint8, uint16, uint32, uint64. +*@li ref: A Tensor. Must be one of the following types: float16, float32, int8, int16, int32, int64, uint8, uint16, uint32, uint64. *@li value: A Tensor of the same type as "ref". \n *@par Attributes: @@ -1235,14 +1218,12 @@ REG_OP(AssignAdd) *@brief Updates "ref" by assigning "value" to it. \n *@par Inputs: -*@li ref: A Tensor. Must be one of the following types: float16, float32, int8, int16, -int32, int64, uint8, uint16, uint32, uint64. +*@li ref: A Tensor. Must be one of the following types: float16, float32, int8, int16, int32, int64, uint8, uint16, uint32, uint64. *@li value: A Tensor of the same type as "ref". \n *@par Attributes: *@li validate_shape: An optional bool. Defaults to "true". - If "true", the operation will validate that the shape of "value" - matches the shape of the Tensor being assigned to. + If "true", the operation will validate that the shape of "value" matches the shape of the Tensor being assigned to. * If "false", "ref" will take on the shape of "value". * This attribute is reserved. *@li use_locking: An optional bool. Defaults to True. @@ -1271,8 +1252,7 @@ REG_OP(Assign) * *@par Inputs: -*@li var: A tensor. Must be one of the following types: float32, float64, -int32, uint8, int16, int8, complex64, int64, qint8, quint8, qint32, uint16, complex128, uint32, uint64 +*@li var: A tensor. Must be one of the following types: float32, float64, int32, uint8, int16, int8, complex64, int64, qint8, quint8, qint32, uint16, complex128, uint32, uint64 *@li value: A tensor of the same type as "var". 
* *@par Attributes: @@ -1664,9 +1644,7 @@ REG_OP(Atan2) * *@par Inputs: -*@li x1: A tensor. Must be one of the following types: float32, float64, int32, - uint8, int16, int8, complex64, int64, qint8, quint8, qint32, uint16, complex128, -float16, uint32, uint64 +*@li x1: A tensor. Must be one of the following types: float32, float64, int32, uint8, int16, int8, complex64, int64, qint8, quint8, qint32, uint16, complex128, float16, uint32, uint64 *@li x2: A tensor of the same type as "x1". * *@par Attributes: @@ -1688,18 +1666,16 @@ REG_OP(ApproximateEqual) /** *@brief Returns the element-wise sum of a list of tensors.\n -* AccumulateNV2 performs the same operation as AddN, but does not wait for all -of its inputs to be ready before beginning to sum.\n This can save memory if -inputs are ready at different times, \n since minimum temporary storage is -proportional to the output size rather than the inputs size.\n Returns a Tensor -of same shape and type as the elements of inputs. \n +* AccumulateNV2 performs the same operation as AddN, but does not wait for all of its inputs +to be ready before beginning to sum.\n This can save memory if inputs are ready at different times, +since minimum temporary storage is proportional to the output size rather than the inputs size. + Returns a Tensor of same shape and type as the elements of inputs. \n * *@par Inputs: *Dynamic inputs, including: -* x: A tensor. Must be one of the following types: float32, float64, int32, -uint8, int16, int8, complex64, int64, \n qint8, quint8, qint32, uint16, -complex128, float16, uint32, uint64. +* x: A tensor. Must be one of the following types: float32, float64, int32, uint8, int16, int8, complex64, int64, +qint8, quint8, qint32, uint16, complex128, float16, uint32, uint64. It's a dynamic input. \n * *@par Outputs: * y: A tensor. Has the same type as "x". @@ -1755,8 +1731,7 @@ REG_OP(FakeQuantWithMinMaxArgs) *@par Inputs: *Two inputs, including: \n -*@li gradients: A Tensor of type float32. 
Backpropagated gradients -above the FakeQuantWithMinMaxArgs operation. +*@li gradients: A Tensor of type float32. Backpropagated gradients above the FakeQuantWithMinMaxArgs operation. *@li x: A Tensor of type float32. Has the same type and format as "gradients".\n * This is the input Tensor of the FakeQuantWithMinMaxArgs operator.\n @@ -2235,13 +2210,9 @@ REG_OP(BiasAdd) *@par Inputs: *Two inputs, including: -*@li x: A Tensor. Must be one of the following types: float32, float64, int32, -uint8, int16, int8, complex64, int64, qint8, quint8, qint32, bfloat16, uint16, -complex128, float16, uint32, uint64. +*@li x: A Tensor. Must be one of the following types: float32, float64, int32, uint8, int16, int8, complex64, int64, qint8, quint8, qint32, bfloat16, uint16, complex128, float16, uint32, uint64. *format is ND. -*@li dimension: A Tensor. Must be one of the following types: int32, int64. -Must be in the range [-rank(input x), rank(input x)]. Describes which dimension -of the input Tensor to reduce across. +*@li dimension: A Tensor. Must be one of the following types: int32, int64. Must be in the range [-rank(input x), rank(input x)]. Describes which dimension of the input Tensor to reduce across. * The format is ND. *@par Attributes: *dtype: The output type, either "int32" or "int64". Defaults to "int64". \n @@ -2315,7 +2286,6 @@ REG_OP(ArgMaxV2) .ATTR(dtype, Type, DT_INT64) .OP_END_FACTORY_REG(ArgMaxV2) - /** *@brief Returns the index with the largest value across axes of a tensor. \n @@ -2328,16 +2298,15 @@ REG_OP(ArgMaxV2) *@li dtype: The output type, either "int32" or "int64". Defaults to "int64". \n *@par Outputs: -*y: A multi-dimensional Tensor of type int32, specifying the index with the -largest value. The dimension is one less than that of "x". \n +*y: A multi-dimensional Tensor of type int32, specifying the index with the largest value. The dimension is one less than that of "x". 
\n *@attention Constraints: *@li x: If there are multiple maximum values, the index of the first maximum value is used. -*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the -dimension length of "x". \n +*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x". \n *@par Third-party framework compatibility * Compatible with TensorFlow operator ArgMax. +* * @par Restrictions: *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ @@ -2960,13 +2929,9 @@ REG_OP(FusedMulAddN) *@li bias: An ND tensor of type float16 or float32. \n *@par Attributes: -*@li axis: An optional int32 used to compute the shape of bias input from the -online bottoms. Defaults to "1". -*@li num_axes: An optional int32 used to compute the shape of bias input from a -Caffe model trained offline. Defaults to "1". -*@li bias_from_blob: An optional bool. If "true", bias is input from a Caffe -model trained offline. If "false", bias is input from online bottoms. Defaults -to "true". \n +*@li axis: An optional int32 used to compute the shape of bias input from the online bottoms. Defaults to "1". +*@li num_axes: An optional int32 used to compute the shape of bias input from a Caffe model trained offline. Defaults to "1". +*@li bias_from_blob: An optional bool. If "true", bias is input from a Caffe model trained offline. If "false", bias is input from online bottoms. Defaults to "true". \n *@par Outputs: *y: An ND tensor of type float16 or float32. \n @@ -2974,25 +2939,13 @@ to "true". \n *@attention Constraints:\n * Assume that the shape length of "x" is "n" and that of "bias" is "m". *@li "axis" is within the range [-n, n-1]. num_axes >= -1. -*@li If "bias_from_blob = true", "num_axes = -1", and "axis >= 0", the ith axis -of "bias" and the (i+"axis")th axis of "x" must have the same size (0 <= i < -n-axis).\n -* If "axis < 0", the ith axis of "bias" and the (i+n+"axis")th axis of "x" must -have the same size (0 <= i < -axis). 
-*@li If "bias_from_blob = true" and "num_axes = 0", "bias" is a scalar with -shape length 1 and dimension size 1. -*@li If "bias_from_blob = true", "num_axes > 0, and "axis >= 0", "axis + -num_axes" must be less than or equal to "n" and the ith axis of "bias" and the -(i+"axis")th axis of "x" must have the same size (0 <= i < num_axes).\n -* If "axis < 0", "n + axis + num_axes" must be less than or equal to "n" and -the ith axis of "bias" and the (i+n+"axis")th axis of "x" must have the same -size (0 <= i < num_axes). -*@li If "bias_from_blob = false", "bias" is not a scalar, and "axis >= 0","axis -+ m" must be less than or equal to "n" and the ith axis of "bias" and the (i -+"axis")th axis of "x" must have the same size (0 <= i < m).\n -* If "axis < 0", "n + axis + m" must be less than or equal to "n" and the ith -axis of "bias" and the (i+n+"axis")th axis of "x" must have the same size (0 <= -i < m). +*@li If "bias_from_blob = true", "num_axes = -1", and "axis >= 0", the ith axis of "bias" and the (i+"axis")th axis of "x" must have the same size (0 <= i < n-axis).\n +* If "axis < 0", the ith axis of "bias" and the (i+n+"axis")th axis of "x" must have the same size (0 <= i < -axis). +*@li If "bias_from_blob = true" and "num_axes = 0", "bias" is a scalar with shape length 1 and dimension size 1. +*@li If "bias_from_blob = true", "num_axes > 0, and "axis >= 0", "axis + num_axes" must be less than or equal to "n" and the ith axis of "bias" and the (i+"axis")th axis of "x" must have the same size (0 <= i < num_axes).\n +* If "axis < 0", "n + axis + num_axes" must be less than or equal to "n" and the ith axis of "bias" and the (i+n+"axis")th axis of "x" must have the same size (0 <= i < num_axes). 
+*@li If "bias_from_blob = false", "bias" is not a scalar, and "axis >= 0","axis + m" must be less than or equal to "n" and the ith axis of "bias" and the (i+"axis")th axis of "x" must have the same size (0 <= i < m).\n +* If "axis < 0", "n + axis + m" must be less than or equal to "n" and the ith axis of "bias" and the (i+n+"axis")th axis of "x" must have the same size (0 <= i < m). *@par Third-party framework compatibility * Compatible with the Caffe operator Bias. */ @@ -3070,12 +3023,10 @@ REG_OP(FusedMulAddNL2loss) *@li x: A Tensor with any format. Must be one of the following types: float16, float32. \n *@par Attributes: -*@li threshold: A required float32. Defaults to "0.0". "x" is compared with -"threshold", outputs "1" for inputs above threshold; "0" otherwise. \n +*@li threshold: A required float32. Defaults to "0.0". "x" is compared with "threshold", outputs "1" for inputs above threshold; "0" otherwise. \n *@par Outputs: -*@li y: A Tensor with any format. Has the same type as the input. Must be one -of the following types: float16, float32. +*@li y: A Tensor with any format. Has the same type as the input. Must be one of the following types: float16, float32. *@par Third-party framework compatibility * Compatible with the Caffe operator Threshold. */ @@ -3093,16 +3044,11 @@ of the following types: float16, float32. *@li x: A tensor. Must be one of the following types: float16, float32. \n *@par Attributes: -*@li axis: An optional int. Specify the axis to be cut at the input tensor. If -this parameter is not provided, find the topk for each batch. Defaults to 10000 -*@li out_max_val: An optional bool. Whether to output the maximum value. If it -is True, the maximum value and index are output, otherwise only the index is -output. +*@li axis: An optional int. Specify the axis to be cut at the input tensor. If this parameter is not provided, find the topk for each batch. Defaults to 10000 +*@li out_max_val: An optional bool. 
Whether to output the maximum value. If it is True, the maximum value and index are output, otherwise only the index is output. * Defaults to False -*@li topk: An optional int. It means the number of top tok in each axis (the -value is greater than or equal to 1), and the value range must be in [1,x.shape -(axis)]. -* Defaults to 1 \n +*@li topk: An optional int. It means the number of top tok in each axis (the value is greater than or equal to 1), and the value range must be in [1,x.shape(axis)]. +* Defaults to 1 *@par Outputs: *@li indices: A tensor of type float16, float32, int32. The index of the maximum value of the output. @@ -3222,8 +3168,7 @@ REG_OP(Axpy) .OP_END_FACTORY_REG(Axpy) /** -*@brief Creates a criterion that measures the loss given input tensors x1 x2 -and a Tensor label y with values 1 or -1. \n +*@brief Creates a criterion that measures the loss given input tensors x1 x2 and a Tensor label y with values 1 or -1. \n *@par Inputs: *@li x1: A ND Tensor with one of the following types: int8, uint8, int32, float16, float32. @@ -3314,4 +3259,4 @@ REG_OP(TensorRedirect) -#endif // GE_OP_ELEWISE_CALCULATION_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_ELEWISE_CALCULATION_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/functional_ops.h b/third_party/fwkacllib/inc/ops/functional_ops.h index bf5ebd51..598d3ad3 100644 --- a/third_party/fwkacllib/inc/ops/functional_ops.h +++ b/third_party/fwkacllib/inc/ops/functional_ops.h @@ -18,8 +18,8 @@ * \file functional_ops.h * \brief */ -#ifndef GE_FUNCTIONAL_OPS_H_ -#define GE_FUNCTIONAL_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_FUNCTIONAL_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_FUNCTIONAL_OPS_H_ #include "graph/operator_reg.h" #include "graph/operator.h" @@ -36,7 +36,7 @@ namespace ge { * if "cond" is a numerical scalar, non-zero means True and zero means False; * if "cond" is a string scalar, non-empty means True and empty means False; * if "cond" is not a scalar, non-empty means True and empty means False. 
- *@li input: The input tensors . \n + *@li input: The input tensors . It's a dynamic input. \n *@par Graphs: *@li then_branch: A subgraph takes 'input' and returns a list of tensors, @@ -69,7 +69,7 @@ REG_OP(_If) * if "cond" is a numerical scalar, non-zero means True and zero means False; * if "cond" is a string scalar, non-empty means True and empty means False; * if "cond" is not a scalar, non-empty means True and empty means False. - *@li input: The input tensors . \n + *@li input: The input tensors . It's a dynamic input. \n *@par Graphs: *@li then_branch: A subgraph takes 'input' and returns a list of tensors, @@ -102,7 +102,7 @@ REG_OP(StatelessIf) * if "cond" is a numerical scalar, non-zero means True and zero means False; * if "cond" is a string scalar, non-empty means True and empty means False; * if "cond" is not a scalar, non-empty means True and empty means False. - *@li input: The input tensors . \n + *@li input: The input tensors . It's a dynamic input. \n *@par Graphs: *@li then_branch: A subgraph takes 'input' and returns a list of tensors, @@ -129,7 +129,7 @@ REG_OP(If) *@par Inputs: *@li branch_index: A int32 scalar which determines the selected subgraph. - *@li input: The input tensors, which will be passed to the subgraph . \n + *@li input: The input tensors, which will be passed to the subgraph . It's a dynamic input. \n *@par Graphs: *branches: A list of subgraphs, each of which takes 'input' and returns a list of tensors, @@ -152,7 +152,7 @@ REG_OP(Case) *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n *@par Inputs: - *input: The input tensors . \n + *input: The input tensors . It's a dynamic input. \n *@par Graphs: *@li cond: A subgraph takes 'input' and returns a tensor. @@ -183,7 +183,7 @@ REG_OP(_While) *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n *@par Inputs: - *input: The input tensors . \n + *input: The input tensors . 
It's a dynamic input. \n *@par Graphs: *@li cond: A subgraph takes 'input' and returns a tensor. @@ -215,7 +215,7 @@ REG_OP(While) *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n *@par Inputs: - *input: The input tensors . \n + *input: The input tensors . It's a dynamic input. \n *@par Graphs: *@li cond: A subgraph takes 'input' and returns a tensor. @@ -250,7 +250,7 @@ REG_OP(StatelessWhile) *@li start: A int32 scalar. The lower bound. *@li limit: A int32 scalar. The upper bound. *@li delta: A int32 scalar. The step size. - *@li input: The input tensors, which will be passed to "body" . \n + *@li input: The input tensors, which will be passed to "body" . It's a dynamic input. \n *@par Graphs: *body: A subgraph takes 'input' and returns a another list of tensors . \n @@ -274,7 +274,7 @@ REG_OP(For) *@brief Pass the input tensors to the subgraph "f" and return the output tensors . \n *@par Inputs: - *args: The input tensors, which will be passed to "f" . \n + *args: The input tensors, which will be passed to "f" . It's a dynamic input. \n *@par Graphs: *f: A subgraph takes 'args' and returns a another list of tensors . \n @@ -303,7 +303,7 @@ REG_OP(PartitionedCall) *@brief Pass the input tensors to the subgraph "f" and return the output tensors . \n *@par Inputs: - *args: The input tensors, which will be passed to "f" . \n + *args: The input tensors, which will be passed to "f" . It's a dynamic input. \n *@par Graphs: *f: A subgraph takes 'args' and returns a another list of tensors . 
\n @@ -330,4 +330,4 @@ REG_OP(StatefulPartitionedCall) } // namespace ge -#endif // GE_FUNCTIONAL_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_FUNCTIONAL_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/get_data_ops.h b/third_party/fwkacllib/inc/ops/get_data_ops.h index 33a64903..33dc4f14 100644 --- a/third_party/fwkacllib/inc/ops/get_data_ops.h +++ b/third_party/fwkacllib/inc/ops/get_data_ops.h @@ -18,8 +18,8 @@ * \file get_data_ops.h * \brief */ -#ifndef GE_OP_GET_DATA_OPS_H_ -#define GE_OP_GET_DATA_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_GET_DATA_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_GET_DATA_OPS_H_ #include "graph/operator_reg.h" @@ -100,4 +100,4 @@ REG_OP(DeviceQueueDataset) } // namespace ge -#endif // GE_OP_GET_DATA_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_GET_DATA_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/hcom_ops.h b/third_party/fwkacllib/inc/ops/hcom_ops.h index 7e985efc..1fe9055c 100644 --- a/third_party/fwkacllib/inc/ops/hcom_ops.h +++ b/third_party/fwkacllib/inc/ops/hcom_ops.h @@ -18,8 +18,8 @@ * \file hcom_ops.h * \brief huawei collective communication library ops. */ -#ifndef GE_OP_HCOM_OPS_H_ -#define GE_OP_HCOM_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_HCOM_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_HCOM_OPS_H_ #include "graph/operator_reg.h" @@ -41,8 +41,8 @@ namespace ge { as the name of a world group. */ REG_OP(HcomAllGather) - .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) - .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) + .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16, DT_INT64, DT_UINT64})) + .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16, DT_INT64, DT_UINT64})) .REQUIRED_ATTR(rank_size, Int) .REQUIRED_ATTR(group, String) .ATTR(alpha, Float, 1.0) @@ -99,8 +99,8 @@ REG_OP(HcomAllReduce) as the name of a world group. 
*/ REG_OP(HcomBroadcast) - .DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) - .DYNAMIC_OUTPUT(y, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) + .DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16, DT_INT64, DT_UINT64})) + .DYNAMIC_OUTPUT(y, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16, DT_INT64, DT_UINT64})) .REQUIRED_ATTR(root_rank, Int) .REQUIRED_ATTR(group, String) .ATTR(alpha, Float, 1.0) @@ -157,7 +157,7 @@ REG_OP(HcomReduceScatter) * @see HcomReceive */ REG_OP(HcomSend) - .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) + .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16, DT_INT64, DT_UINT64})) .REQUIRED_ATTR(group, String) .REQUIRED_ATTR(sr_tag, Int) .REQUIRED_ATTR(dest_rank, Int) @@ -190,7 +190,7 @@ REG_OP(HcomSend) * @see HcomSend */ REG_OP(HcomReceive) - .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16})) + .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16, DT_INT64, DT_UINT64})) .REQUIRED_ATTR(group, String) .REQUIRED_ATTR(sr_tag, Int) .REQUIRED_ATTR(src_rank, Int) @@ -200,5 +200,30 @@ REG_OP(HcomReceive) .ATTR(beta, Float, 0.0) .OP_END_FACTORY_REG(HcomReceive) +/** + * @brief Performs Remote Read of input tensors + * @par Inputs: + * remote: A tensor. describing the remote memory address to read: u64 remoteId, u64 addrRemote, u64 length + * @par Outputs: + * local: A Tensor. whose value is length / size_of(Type) + */ +REG_OP(HcomRemoteRead) + .INPUT(remote, TensorType({DT_INT64, DT_UINT64})) + .OUTPUT(local, TensorType::ALL()) + .REQUIRED_ATTR(dtype, Type) + .OP_END_FACTORY_REG(HcomRemoteRead) + +/** + * @brief Performs Remote Write of input tensors + * @par Inputs: + * remote: A tensor. describing the remote memory address to write: u64 remoteId, u64 addrRemote, u64 length + * @par Inputs: + * local: A Tensor. 
whose value is length / size_of(Type) + */ +REG_OP(HcomRemoteWrite) + .INPUT(remote, TensorType({DT_INT64, DT_UINT64})) + .INPUT(local, TensorType::ALL()) + .OP_END_FACTORY_REG(HcomRemoteWrite) + } // namespace ge -#endif // GE_OP_HCOM_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_HCOM_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/hvd_ops.h b/third_party/fwkacllib/inc/ops/hvd_ops.h index bde8486c..a49ec5ed 100644 --- a/third_party/fwkacllib/inc/ops/hvd_ops.h +++ b/third_party/fwkacllib/inc/ops/hvd_ops.h @@ -18,8 +18,8 @@ * \file hvd_ops.h * \brief Horovod collective communication library ops. */ -#ifndef GE_OP_HVD_OPS_H_ -#define GE_OP_HVD_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_HVD_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_HVD_OPS_H_ #include "graph/operator_reg.h" @@ -78,4 +78,4 @@ REG_OP(HorovodBroadcast) .OP_END_FACTORY_REG(HorovodBroadcast) } // namespace ge -#endif // GE_OP_HVD_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_HVD_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/image_ops.h b/third_party/fwkacllib/inc/ops/image_ops.h index 302823a2..ce3262f9 100644 --- a/third_party/fwkacllib/inc/ops/image_ops.h +++ b/third_party/fwkacllib/inc/ops/image_ops.h @@ -18,8 +18,8 @@ * \file image_ops.h * \brief */ -#ifndef GE_OP_MAGE_OPS_H_ -#define GE_OP_MAGE_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_IMAGE_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_IMAGE_OPS_H_ #include "graph/operator_reg.h" @@ -160,10 +160,8 @@ REG_OP(CropAndResize) *@li box_index: A Tensor of type int32. A 1-D tensor of shape [num_boxes] with int32 values in [0, batch) . \n *@par Attributes: -*@li crop_size: list int. [crop_height, crop_width]. All cropped image patches -are resized to this size. -*@li extrapolation_value: An optional float. Defaults to 0. Value used for -extrapolation, when applicable. +*@li crop_size: list int. [crop_height, crop_width]. All cropped image patches are resized to this size. +*@li extrapolation_value: An optional float. Defaults to 0. 
Value used for extrapolation, when applicable. *@li method: An optional string from: '"bilinear"'. Defaults to "bilinear" . \n *@par Outputs: @@ -174,6 +172,7 @@ extrapolation, when applicable. *@par Third-party framework compatibility *Compatible with tensorflow CropAndResize operator. + * @par Restrictions: * Warning: THIS FUNCTION IS DEPRECATED. Please use CropAndResize instead. */ @@ -1345,4 +1344,4 @@ REG_OP(SpatialTransformerD) } // namespace ge -#endif // GE_OP_MAGE_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_IMAGE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/internal_ops.h b/third_party/fwkacllib/inc/ops/internal_ops.h index 7e9fd4a4..9dde14a5 100644 --- a/third_party/fwkacllib/inc/ops/internal_ops.h +++ b/third_party/fwkacllib/inc/ops/internal_ops.h @@ -18,8 +18,8 @@ * \file internal_ops.h * \brief */ -#ifndef GE_OP_INTERNAL_OPS_H_ -#define GE_OP_INTERNAL_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_INTERNAL_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_INTERNAL_OPS_H_ #include "graph/operator_reg.h" #include "graph/operator.h" @@ -81,4 +81,4 @@ REG_OP(InternalDataMove) } // namespace ge -#endif // GE_OP_INTERNAL_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_INTERNAL_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/linalg_ops.h b/third_party/fwkacllib/inc/ops/linalg_ops.h index 5d98f999..7a6fbc59 100644 --- a/third_party/fwkacllib/inc/ops/linalg_ops.h +++ b/third_party/fwkacllib/inc/ops/linalg_ops.h @@ -18,8 +18,8 @@ * \file linalg_ops.h * \brief */ -#ifndef GE_OP_LINALG_OPS_H_ -#define GE_OP_LINALG_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_LINALG_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_LINALG_OPS_H_ #include "graph/operator_reg.h" #include "graph/operator.h" @@ -432,4 +432,4 @@ REG_OP(TridiagonalSolve) } // namespace ge -#endif // GE_OP_LINALG_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_LINALG_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/logging_ops.h b/third_party/fwkacllib/inc/ops/logging_ops.h index db9097ce..bc8ae2b8 100644 --- 
a/third_party/fwkacllib/inc/ops/logging_ops.h +++ b/third_party/fwkacllib/inc/ops/logging_ops.h @@ -18,8 +18,8 @@ * \file logging_ops.h * \brief */ -#ifndef GE_OP_LOGGING_OPS_H -#define GE_OP_LOGGING_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_LOGGING_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_LOGGING_OPS_H_ #include "graph/operator.h" #include "graph/operator_reg.h" @@ -113,4 +113,4 @@ REG_OP(PrintV2) .OP_END_FACTORY_REG(PrintV2) } // namespace ge -#endif // GE_OP_LOGGING_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_LOGGING_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/lookup_ops.h b/third_party/fwkacllib/inc/ops/lookup_ops.h index 84b138c4..b37ab048 100644 --- a/third_party/fwkacllib/inc/ops/lookup_ops.h +++ b/third_party/fwkacllib/inc/ops/lookup_ops.h @@ -18,8 +18,8 @@ * \file lookup_ops.h * \brief */ -#ifndef GE_OP_LOOKUP_OPS_H_ -#define GE_OP_LOOKUP_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_LOOKUP_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_LOOKUP_OPS_H_ #include "graph/operator_reg.h" @@ -305,4 +305,4 @@ REG_OP(MutableHashTable) .OP_END_FACTORY_REG(MutableHashTable) } // namespace ge -#endif // GE_OP_LOOKUP_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_LOOKUP_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/math_ops.h b/third_party/fwkacllib/inc/ops/math_ops.h index 3d7ff1d9..ff51bb2f 100644 --- a/third_party/fwkacllib/inc/ops/math_ops.h +++ b/third_party/fwkacllib/inc/ops/math_ops.h @@ -18,8 +18,8 @@ * \file math_ops.h * \brief */ -#ifndef GE_OP_MATH_OPS_H_ -#define GE_OP_MATH_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_MATH_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_MATH_OPS_H_ #include "graph/operator_reg.h" #include "graph/operator.h" @@ -511,6 +511,23 @@ REG_OP(IsFinite) .OUTPUT(y, TensorType({DT_BOOL})) .OP_END_FACTORY_REG(IsFinite) +/** + * *@brief Compute element-wise infiniteness, return a boolean tensor. + * + * *@par Inputs: + * *x:A Tensor. + * + * *@par Outputs: + * *y:A Tensor. Has the same shape as x. + * + * *@par Third-party framework compatibility. 
+ * *Compatible with tensorflow IsInf operator. + * */ +REG_OP(IsInf) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_BOOL})) + .OP_END_FACTORY_REG(IsInf) + /** * *@brief Computes the complex absolute value of a tensor. * @@ -675,6 +692,137 @@ REG_OP(IFMR) .REQUIRED_ATTR(search_step, Float) .REQUIRED_ATTR(with_offset, Bool) .OP_END_FACTORY_REG(IFMR) + +/** +*@brief weights adaptive range quantization. \n + +*@par Inputs: +*@li w:A Tensor of weights. \n + +*@par Attributes: +*axes: specify channel. +*num_bits: the bits num used for quantize. +*offset_flag: whether using offset. \n + +*@par Outputs: +*scale: quantization factor scale. +*offset: quantization factor offset. +*y: fake quantized weights. \n + +*@par Third-party framework compatibility +*Compatible with mindspore +*/ + +REG_OP(WtsARQ) + .INPUT(w, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(scale, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(offset, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(axes, ListInt, {0}) + .ATTR(num_bits, Int, 8) + .ATTR(offset_flag, Bool, false) + .OP_END_FACTORY_REG(WtsARQ) + +/** +*@brief The acts_ulq. \n + +*@par Inputs: +*@li x:A Tensor of feature map +*@li clamp_min:A Tensor of min clamp value of feature map. +*@li clamp_max:A Tensor of max clamp value of feature map. + +*@par Attributes: +*fixed_min: fix min to zero. +*num_bits: quant bits. \n + +*@par Outputs: +*y: output fake quant feature map. +*clamp_min_mask: where x > clamp_min +*clamp_max_mask: where x < clamp_max +*x_clamped_loss: clamp loss. 
\n + +*@par Third-party framework compatibility +*Compatible with mindspore +*/ + +REG_OP(ActsULQ) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(clamp_min, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(clamp_max, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(clamp_min_mask, TensorType({DT_BOOL})) + .OUTPUT(clamp_max_mask, TensorType({DT_BOOL})) + .OUTPUT(x_clamped_loss, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(fixed_min, Bool, false) + .ATTR(num_bits, Int, 8) + .OP_END_FACTORY_REG(ActsULQ) + +/** +*@brief The acts_ulq_input_grad. \n + +*@par Inputs: +*@li y_grad: A Tensor of gradient +*@li clamp_min_mask: A Tensor of boolean mask indicating whether an additional one is needed. +*@li clamp_max_mask: A Tensor of boolean mask indicating whether an additional one is needed. + +*@par Outputs: +*x_grad: The gradient of inputs. \n + +*@par Third-party framework compatibility +*Compatible with mindspore +*/ + +REG_OP(ActsULQInputGrad) + .INPUT(y_grad, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(clamp_min_mask, TensorType({DT_BOOL})) + .INPUT(clamp_max_mask, TensorType({DT_BOOL})) + .OUTPUT(x_grad, TensorType({DT_FLOAT16, DT_FLOAT})) + .OP_END_FACTORY_REG(ActsULQInputGrad) + +/** +*@brief The act_ulq_clamp_max_grad. \n + +*@par Inputs: +*@li y_grad: A Tensor of gradient +*@li clamp_max_mask: A Tensor of boolean mask indicating whether an additional one is needed. +*@li x_clamped_loss: A Tensor of gradient. \n + +*@par Outputs: +*clamp_max_grad: The gradient of clamp max. \n + +*@par Third-party framework compatibility +*Compatible with mindspore +*/ + +REG_OP(ActULQClampMaxGrad) + .INPUT(y_grad, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(clamp_max_mask, TensorType({DT_BOOL})) + .INPUT(x_clamped_loss, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(clamp_max_grad, TensorType({DT_FLOAT16, DT_FLOAT})) + .OP_END_FACTORY_REG(ActULQClampMaxGrad) + +/** +*@brief The act_ulq_clamp_min_grad. 
\n + +*@par Inputs: +*@li y_grad: A Tensor of gradient +*@li clamp_min_mask: A Tensor of boolean mask indicating whether an additional one is needed. +*@li x_clamped_loss: A Tensor of gradient. \n + +*@par Outputs: +*clamp_min_grad: The gradient of clamp min. \n + +*@par Third-party framework compatibility +*Compatible with mindspore +*/ + +REG_OP(ActULQClampMinGrad) + .INPUT(y_grad, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(clamp_min_mask, TensorType({DT_BOOL})) + .INPUT(x_clamped_loss, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(clamp_min_grad, TensorType({DT_FLOAT16, DT_FLOAT})) + .OP_END_FACTORY_REG(ActULQClampMinGrad) + } // namespace ge -#endif // GE_OP_MATH_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_MATH_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h index bceff0cd..ed23d3f6 100644 --- a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h @@ -18,8 +18,8 @@ * \file matrix_calculation_ops.h * \brief */ -#ifndef GE_OP_MATRIX_CALCULATION_OPS_H -#define GE_OP_MATRIX_CALCULATION_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_MATRIX_CALCULATION_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_MATRIX_CALCULATION_OPS_H_ #include "graph/operator_reg.h" @@ -95,6 +95,10 @@ REG_OP(MatMulV2) /** *@brief Performs Matrix-to-matrix Multiply, producing c=alpha[0]*a*b+beta[0]*c . \n +*@attention Constraints: +* For better performance, The k-axis must be aligned to 16 (input type +* is float16) or 32 (input type is int8). \n + *@par Inputs: *Five inputs, including: *@li a: A matrix Tensor. Must be one of the following types: float16, int8. @@ -398,8 +402,8 @@ REG_OP(TensorScatterUpdate) *Must be one of the following types: float16, float32, int32, int8, uint8 *@par Attributes: -*use_locking: An optional bool. Defaults to "False". If "True", the operation - * will be protected by a lock . \n +* use_locking: An optional bool. 
Defaults to "False". If "True", the operation +* will be protected by a lock . \n *@par Outputs: *var: A Tensor. Has the same type and format as input "var" . \n @@ -430,7 +434,7 @@ REG_OP(ScatterAdd) *@par Attributes: *@li use_locking: An optional bool. Defaults to "False". If "True", - * the operation will be protected by a lock . \n +* the operation will be protected by a lock . \n *@par Outputs: *var: A Tensor. Has the same type and format as input "var" . \n @@ -459,7 +463,7 @@ REG_OP(ScatterDiv) *Must be one of the following types: float16, float, int32, int8, uint8 *@par Attributes: *use_locking: An optional bool. Defaults to "False". If "True", - * the operation will be protected by a lock . \n +* the operation will be protected by a lock . \n *@par Outputs: *var: A Tensor. Has the same type and format as input "var" . \n @@ -488,7 +492,7 @@ REG_OP(ScatterNdAdd) *Must be one of the following types: int32 *@li updates: An ND Tensor. \n -*Must be one of the following types: float16, float32, int32, int8, uint8 +* Must be one of the following types: float16, float32, int32, int8, uint8 *@par Outputs: *y: A Tensor. Has the same type and format as input "x" . \n @@ -517,10 +521,10 @@ REG_OP(TensorScatterAdd) *@par Attributes: *use_locking: An optional bool. Defaults to "False". If "True", - * the operation will be protected by a lock . \n +* the operation will be protected by a lock . \n *@par Outputs: -*var: A Tensor. Has the same type and format as input "var" . \n +* var: A Tensor. Has the same type and format as input "var" . \n *@par Third-party framework compatibility * Compatible with the TensorFlow operator ScatterNdSub. @@ -549,7 +553,7 @@ REG_OP(ScatterNdSub) *Must be one of the following types: float16, float32, int32, int8, uint8 *@par Outputs: -*y: A Tensor. Has the same type and format as input "x" . \n +* y: A Tensor. Has the same type and format as input "x" . 
\n *@par Third-party framework compatibility * Compatible with the TensorFlow operator TensorScatterSub. @@ -574,10 +578,10 @@ REG_OP(TensorScatterSub) *Must be one of the following types: float16, float, int32, int8, uint8 *@par Attributes: *use_locking: An optional bool. Defaults to "False". If "True", - * the operation will be protected by a lock . \n +* the operation will be protected by a lock . \n *@par Outputs: -*var: A Tensor. Has the same type and format as input "var" . \n +* var: A Tensor. Has the same type and format as input "var" . \n *@par Third-party framework compatibility * Compatible with the TensorFlow operator ScatterSub. @@ -647,7 +651,7 @@ REG_OP(DiagPart) *@li num_output: Reserved. *@li transpose: A bool, specifying weight whether to transpose, either "true" or "false". Defaults to "false". *@li axis: Optional. A int, 1 or 2, specifying which dimension the input "K" starts from. Defaults to 1. - * The product of the subsequent dimensions starting form first dimension or the second dimension is "K". +* The product of the subsequent dimensions starting form first dimension or the second dimension is "K". *@li offset_x: Reserved . \n *@par Outputs: @@ -764,7 +768,7 @@ REG_OP(ConfusionMatrix) *@par Attributes: *use_locking: An optional bool. Defaults to "False". If "True", the operation - * will be protected by a lock . \n +* will be protected by a lock . \n *@par Outputs: *var: A Tensor. Has the same type and format as input "var" . \n @@ -797,7 +801,7 @@ REG_OP(ScatterMul) *@par Attributes: *use_locking: An optional bool. Defaults to "False". If "True", the operation - * will be protected by a lock . \n +* will be protected by a lock . \n *@par Outputs: *var: A Tensor. Has the same type and format as input "var" . \n @@ -830,7 +834,7 @@ REG_OP(ScatterMin) *@par Attributes: *use_locking: An optional bool. Defaults to "False". - * If "True", the operation will be protected by a lock . \n +* If "True", the operation will be protected by a lock . 
\n *@par Outputs: *var: A Tensor. Has the same type and format as input "var" . \n @@ -863,7 +867,7 @@ REG_OP(ScatterMax) *@par Attributes: *use_locking: An optional bool. Defaults to "False". If "True", - * the operation will be protected by a lock . \n +* the operation will be protected by a lock . \n *@par Outputs: *var: A Tensor. Has the same type and format as input "var" . \n @@ -977,4 +981,4 @@ REG_OP(MatrixDiagV2) } // namespace ge -#endif // GE_OP_MATRIX_CALCULATION_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_MATRIX_CALCULATION_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h index 073d541d..0c6a5dff 100644 --- a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h @@ -18,8 +18,8 @@ * \file nn_batch_norm_ops.h * \brief */ -#ifndef GE_OP_NN_BATCH_NORM_OPS_H -#define GE_OP_NN_BATCH_NORM_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_NN_BATCH_NORM_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_NN_BATCH_NORM_OPS_H_ #include "graph/operator_reg.h" @@ -87,58 +87,39 @@ REG_OP(L2NormalizeGrad) *@par Inputs: * Five inputs, including: (NHWC, NCHW, or NC1HWC0 supported) -*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW -for 4D or NC1HWC0 for 5D. -*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format -NHWC or NCHW. Must be 5D +*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D. +*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the scaling factor. *@li offset: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the offset. -*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format -NHWC or NCHW. Must be 5D -if input "x" is with format NC1HWC0. 
Specifies the mean used for inference. -Must be "None" if the +*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D +if input "x" is with format NC1HWC0. Specifies the mean used for inference. Must be "None" if the operation is used for training. -*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format -NHWC or NCHW. Must be -5D if input "x" is with format NC1HWC0. Specifies the variance used for -inference. Must be "None" +*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be +5D if input "x" is with format NC1HWC0. Specifies the variance used for inference. Must be "None" if the operation is used for training . \n *@par Attributes: -*@li epsilon: An optional float32, specifying the small value added to variance -to avoid dividing by zero. Defaults to "0.0001". -*@li data_format: An optional string, specifying the format of "x". Defaults to -"NHWC". -*@li is_training: An optional bool, specifying if the operation is used for -training or inference. Defaults to "True" . \n +*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.0001". +*@li data_format: An optional string, specifying the format of "x". Defaults to "NHWC". +*@li is_training: An optional bool, specifying if the operation is used for training or inference. Defaults to "True" . \n *@par Outputs: * Five outputs, including: (NHWC, NCHW, or NC1HWC0 supported) -*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x", -with format NHWC or NCHW for 4D or NC1HWC0 for 5D. -*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with -format NHWC or NCHW. Must be 5D +*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x", with format NHWC or NCHW for 4D or NC1HWC0 for 5D. +*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. 
Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x". -*@li batch_variance: A Tensor of type float32. Must be 1D if input "x" is with -format NHWC or NCHW. +*@li batch_variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x". -*@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input -"x" is with format NHWC or NCHW. -Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x" for -gradient computation. Pass "None" to skip this output. -*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input -"x" is with format NHWC or NCHW. -Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x" -for gradient computation. Pass "None" to skip this output . \n +*@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. +Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x" for gradient computation. Pass "None" to skip this output. +*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. +Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x" for gradient computation. Pass "None" to skip this output . \n *@attention Constraints: -*@li If the operation is used for inference and outputs "reserve_space_1" and -"reserve_space_2" are available, -then "reserve_space_1" has the same value as "mean" and "reserve_space_2" has -the same value as "variance". -*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square -root instruction . \n +*@li If the operation is used for inference and outputs "reserve_space_1" and "reserve_space_2" are available, +then "reserve_space_1" has the same value as "mean" and "reserve_space_2" has the same value as "variance". 
+*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction . \n *@par Third-party framework compatibility *@li Compatible with the TensorFlow operator fused_batch_norm. @@ -185,17 +166,13 @@ is used for training or inference. Defaults to "True" . \n *@li y: A 4D Tensor of type float16 or float32, for the normalized "x". *@li batch_mean: A 1D Tensor of type float32, for the mean of "x". *@li batch_variance: A 1D Tensor of type float32, for the variance of "x". -*@li reserve_space_1: A 1D Tensor of type float32, for the mean of "x" for -gradient computation. -*@li reserve_space_2: A 1D Tensor of type float32, for the variance of "x" -for gradient computation . \n +*@li reserve_space_1: A 1D Tensor of type float32, for the mean of "x" for gradient computation. +*@li reserve_space_2: A 1D Tensor of type float32, for the variance of "x" for gradient computation . \n *@attention Constraints: *@li If the operation is used for inference, then output "reserve_space_1" -has the same value as "mean" and output "reserve_space_2" has the same value as -"variance". -*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square -root instruction . \n +has the same value as "mean" and output "reserve_space_2" has the same value as "variance". +*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction . \n *@par Third-party framework compatibility * Compatible with the TensorFlow operator fused_batch_norm_v2. @@ -221,34 +198,23 @@ REG_OP(BatchNormExt2) *@par Inputs: * Five inputs, including: -*@li y_backprop: A 4D or 5D Tensor of type float16 or float32, with format -NHWC, NCHW, or NC1HWC0, for the gradient. -*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, -or NC1HWC0. -*@li scale: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or -NC1HWC0. -*@li reserve_space_1: A 4D or 5D Tensor of type float32, with format NHWC, -NCHW, or NC1HWC0. 
It is an output of BatchNorm. -*@li reserve_space_2: A 4D or 5D Tensor of type float32, with format NHWC, -NCHW, or NC1HWC0. It is an output of BatchNorm . \n +*@li y_backprop: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0, for the gradient. +*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0. +*@li scale: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. +*@li reserve_space_1: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm. +*@li reserve_space_2: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm . \n *@par Attributes: -*@li epsilon: An optional float32. Defaults to "0.0001". A small float number -added to the variance of "x". +*@li epsilon: An optional float32. Defaults to "0.0001". A small float number added to the variance of "x". *@li data_format: An optional string. Defaults to "NHWC". *@li is_training: An optional bool. Defaults to "true". Specifies the operation is for training (default) or inference . \n *@par Outputs: -*@li x_backprop: A Tensor of type float16 or float32, with format NHWC, NCHW, -or NC1HWC0, for the offset of "x". -*@li scale_backprop: A Tensor of type float32, with format NHWC, NCHW, or -NC1HWC0, for the offset of "scale". -*@li *offset_backprop: A Tensor of type float32, with format NHWC, NCHW, or -NC1HWC0, for the offset of "offset". -*@li *reserve_space_4: A Tensor of type float32, with shape NHWC, NCHW, or -NC1HWC0. Pass "None" to skip this output. -*@li *reserve_space_5: A Tensor of type float32, with shape NHWC, NCHW, or -NC1HWC0. Pass "None" to skip this output . \n +*@li x_backprop: A Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0, for the offset of "x". +*@li scale_backprop: A Tensor of type float32, with format NHWC, NCHW, or NC1HWC0, for the offset of "scale". 
+*@li *offset_backprop: A Tensor of type float32, with format NHWC, NCHW, or NC1HWC0, for the offset of "offset". +*@li *reserve_space_4: A Tensor of type float32, with shape NHWC, NCHW, or NC1HWC0. Pass "None" to skip this output. +*@li *reserve_space_5: A Tensor of type float32, with shape NHWC, NCHW, or NC1HWC0. Pass "None" to skip this output . \n *@attention Constraints: * The preceding layer of this operator must be operator BatchNorm . \n @@ -278,28 +244,21 @@ REG_OP(BatchNormGrad) *@par Inputs: * Five inputs, including: -*@li y_backprop: A 4D Tensor of type float16 or float32, with format NHWC or -NCHW, for the gradient. +*@li y_backprop: A 4D Tensor of type float16 or float32, with format NHWC or NCHW, for the gradient. *@li x: A 4D Tensor of type float16 or float32, with format NHWC or NCHW. *@li scale: A 4D Tensor of type float32, with format NHWC or NCHW. -*@li reserve_space_1: A 4D Tensor of type float32, with format NHWC or NCHW. It -is an output of BatchNormExt2. -*@li reserve_space_2: A 4D Tensor of type float32, with format NHWC or NCHW. It -is an output of BatchNormExt2 . \n +*@li reserve_space_1: A 4D Tensor of type float32, with format NHWC or NCHW. It is an output of BatchNormExt2. +*@li reserve_space_2: A 4D Tensor of type float32, with format NHWC or NCHW. It is an output of BatchNormExt2 . \n *@par Attributes: *@li epsilon: A required float32. A small float number added to the variance of "x". *@li data_format: A required string for the format. -*@li is_training: A required bool for specifying the operation is for training -(true) or inference (false) . \n +*@li is_training: A required bool for specifying the operation is for training (true) or inference (false) . \n *@par Outputs: -*@li x_backprop: A Tensor of type float16 or float32, with format NHWC or NCHW, -for the offset of "x". -*@li scale_backprop: A Tensor of type float32, with format NHWC or NCHW, for -the offset of "scale". 
-*@li offset_backprop: A Tensor of type float32, with format NHWC or NCHW, for -the offset of "offset". +*@li x_backprop: A Tensor of type float16 or float32, with format NHWC or NCHW, for the offset of "x". +*@li scale_backprop: A Tensor of type float32, with format NHWC or NCHW, for the offset of "scale". +*@li offset_backprop: A Tensor of type float32, with format NHWC or NCHW, for the offset of "offset". *@li reserve_space_3: A Tensor of type float32, with format NHWC or NCHW. *@li reserve_space_4: A Tensor of type float32, with format NHWC or NCHW . \n @@ -331,18 +290,14 @@ REG_OP(BatchNormGradExt2) *@brief Performs batch normalization . \n *@par Inputs: -*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW -for 4D or NC1HWC0 for 5D. -*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" -Specifies the mean used for inference. -*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x" -Specifies the variance used for inference. +*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D. +*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" Specifies the mean used for inference. +*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x" Specifies the variance used for inference. *@li momentum: A Tensor,represents the mean and the variance's scale factor *@li scale: An optional tensor of type float16 or float32, no use *@li offset: An optional tensor of type float16 or float32, no use *@par Attributes: -*@li epsilon: An optional float32, specifying the small value added to variance -to avoid dividing by zero. Defaults to "0.00001". +*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.00001". *@li use_global_stats: mean inference mode , only can be "True". 
*@li mode: An optional input, not use *@par Outputs: @@ -360,20 +315,16 @@ REG_OP(BNInference) .ATTR(use_global_stats, Bool,true) .ATTR(mode, Int,1) .OP_END_FACTORY_REG(BNInference) - /** *@brief aicpu batch normalization host . \n *@par Inputs: -*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" -Specifies the mean used for inference. -*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x" -Specifies the variance used for inference. +*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" Specifies the mean used for inference. +*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x" Specifies the variance used for inference. *@li momentum: An optional float, mean and variance's Scale factor *@par Attributes: -*@li epsilon: An optional float32, specifying the small value added to variance -to avoid dividing by zero. Defaults to "0.00001". +*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.00001". *@li use_global_stats: mean inference mode , only can be "True". *@li mode: An optional attr, not use *@par Outputs: @@ -397,19 +348,14 @@ REG_OP(BnHost) *@brief Performs batch normalization . \n *@par Inputs: -*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW -for 4D or NC1HWC0 for 5D. -*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" -Specifies the mean used for inference. -*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x" -Specifies the variance used for inference. +*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D. +*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" Specifies the mean used for inference. +*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x" Specifies the variance used for inference. 
*@li scale: An optional tensor of type float16 or float32, no use *@li offset: An optional tensor of type float16 or float32, no use *@par Attributes: -*@li momentum: An optional float32 num, represents the mean and the variance's -scale factor -*@li epsilon: An optional float32, specifying the small value added to variance -to avoid dividing by zero. Defaults to "0.00001". +*@li momentum: An optional float32 num, represents the mean and the variance's scale factor +*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.00001". *@li use_global_stats: mean inference mode , only can be "True". *@li mode: An optional attr, not use *@par Outputs: @@ -432,4 +378,4 @@ REG_OP(BNInferenceD) } // namespace ge -#endif // GE_OP_NN_BATCH_NORM_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_BATCH_NORM_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h index 6307889d..c5a6a523 100644 --- a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h @@ -18,8 +18,8 @@ * \file nn_calculation_ops.h * \brief */ -#ifndef GE_OP_NN_CALCULATION_OPS_H -#define GE_OP_NN_CALCULATION_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_NN_CALCULATION_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_NN_CALCULATION_OPS_H_ #include "graph/operator_reg.h" @@ -636,7 +636,7 @@ REG_OP(Conv2DBackpropFilterD) *@verbatim |Name | Field | Scope ------------------|----------|---------- - |Input Image Size | H | [1, 4096] + |Input Image Size | H | [1, 100000] | | W | [1, 4096] ------------------|----------|---------- |Filter Size | H | [1, 255] @@ -766,6 +766,122 @@ REG_OP(Conv2DCompress) .ATTR(offset_x, Int, 0) .OP_END_FACTORY_REG(Conv2DCompress) +/** +*@brief Computes a 2D convolution given 4D "x", "filter" and "offsets" +* tensors. +*@par Inputs: +* @li x: A 4D tensor of input images. 
With shape of +* [batch, in_height, in_width, in_channels] when format is "NHWC". +* @li filter: A 4D tensor of filters. Must have the same type as "x". With +* shape of [filter_height, filter_width, in_channels, out_channels] when format +* is "HWCN". +* @li offsets: A 4D tensor of offsets. With shape of +* [batch, deformable_groups * filter_height * filter_width * 3, in_height, +* in_width] when format is "NCHW". +* @li bias: An optional 1D tensor. Shape is [out_channels]. +* +* The input and output tensor attributes are listed as follows: +* @verbatim + |Tensor | x | filter | offsets | bias | y + -----------|---------|---------|---------|----------|-------- + |Data Type | float16 | float16 | float16 | float16 | float16 + -----------|---------|---------|---------|----------|-------- + |Format | NCHW | NCHW | NCHW | ND | NCHW + | | NHWC | HWCN | | | NHWC +@endverbatim +* It should be noted that the data types must correspond to each other, but +* the format does not need to. + +*@par Attributes: +* @li strides: Required. A list of 4 integers. Specifying the strides of the +* convolution along the height and width. The dimension order is determined +* by the data format of "x". By default the N and C dimensions are set to 1. +* @li pads: Required. A list of 4 integers. Specifying the top, bottom, left +* and right padding. +* @li dilations: Optional. A list of 4 integers. Specifying the dilation rate +* to use for dilated convolution. Has the same dimension order and value as +* "strides". +* @li groups: Optional. Number of blocked connections from input channels to +* output channels. Input channels and output channels must both be divisible +* by "groups".Type is int32. +* @li data_format: Optional. An optional string from: "NHWC", "NCHW". Specifying the +* data format of the input and output images. Type is string. Defaults to +* "NHWC". Reserved. +* @li deformable_groups: Optional. 
Cut the c chanel of input X into deformable_groups, +* each share a different offsets. Input channels must be divisible by +* "deformable_groups". Type is int32. + +*@par Outputs: +* @li y: A 4D Tensor of output images. Must have the same type and format as +* "x". With shape of [batch, out_channels, out_height, out_width] when format +* is "NHWC". +* @li output_height = (in_height + top_pad + botton_pad - +* dilation_h * (filter_height - 1) -1) / stride_h + 1 +* @li output_width = (in_width + left_pad + right_pad - +* dilation_w * (filter_width - 1) -1) / stride_w + 1 + +*@attention +* @li The parameter scope is listed as follows: +* @verbatim + |Name | Field | Scope + ------------------|--------------|---------------------------------------- + |Input Image Size | H dimension | 1 <= in_height * filter_height <= 4096 + | | W dimension | 1 <= in_width * filter_width <=4096 + ------------------|--------------|---------------------------------------- + |Filter Size | H dimension | [1, 255] + | | W dimension | [1, 255] + ------------------|--------------|---------------------------------------- + |offsets Size | C dimension | offsets_c = deformable_groups * + | | | filter_width * filter_height * 3 + | | H dimension | the same as output H dimension + | | W dimension | the same as output W dimension + ------------------|--------------|---------------------------------------- + |Stride Size | H dimension | [1, 63] + | | W dimension | [1, 63] + ------------------|--------------|---------------------------------------- + |Padding Size | top side | [0, 255] + | | bottom side | [0, 255] + | | left side | [0, 255] + | | right side | [0, 255] + ------------------|--------------|---------------------------------------- + |Dilation Size | H dimension | [1, 255] + | | W dimension | [1, 255] +@endverbatim + +* @li There are restrictions for certain scenarios: +* @verbatim + | Output | Restrictions + -------------------|--------------------------- + | W dimension == 1 | HxW(input) 
== HxW(filter) + | H dimension == 1 | + -------------------|--------------------------- + | W dimension == 1 | Not supported + | H dimension != 1 | +@endverbatim +* As shown above, "HxW(input)" indicates the image size after padding and +* "HxW(filter)" indicates the filter size after dilation. + +*@par Quantization supported or not +* Yes + +*@par Third-party framework compatibility +*@li Compatible with the TensorFlow operator "conv2d". +*@li Compatible with the Caffe operator 2D "Convolution". +*/ +REG_OP(DeformableConv2D) + .INPUT(x, TensorType({DT_FLOAT16})) + .INPUT(filter, TensorType({DT_FLOAT16})) + .INPUT(offsets, TensorType({DT_FLOAT16})) + .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16})) + .OUTPUT(y, TensorType({DT_FLOAT16})) + .REQUIRED_ATTR(strides, ListInt) + .REQUIRED_ATTR(pads, ListInt) + .ATTR(dilations, ListInt, {1, 1, 1, 1}) + .ATTR(groups, Int, 1) + .ATTR(data_format, String, "NHWC") + .ATTR(deformable_groups, Int, 1) + .OP_END_FACTORY_REG(DeformableConv2D) + /** *@brief Computes a 3D convolution given 5D "x" and "filter" tensors. *@par Inputs: @@ -1275,5 +1391,39 @@ REG_OP(Conv2DTransposeD) .ATTR(offset_x, Int, 0) .OP_END_FACTORY_REG(Conv2DTransposeD) +/** +*@brief In the deformable convolution operator, the original input FeatureMap is expanded to a ksize_y * H * ksize_x *W +*FeatureMap by bilinear interpolation according to the offset offset. +*@par Inputs: + * Four inputs: + * @li x: A Tensor of type float16 + * @li offsets: A Tensor of type float16,float32.Deformation offset parameter. +*@par Required Attributes: + * @li strides: A tuple/list of 2 integers.The stride of the sliding window for + * height and width for H/W dimension. + * @li pads: A tuple/list of 4 integers.Padding added to each dimension + * of the input. + * @li ksize: A tuple/list of 2 integers.kernel size. +*@par Attributes: + * Three attributes: + * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension + * of input. 
Defaults to [0, 0, 0, 0] + * @li data_format: An optional string from: "NCHW", "NHWC". Defaults to "NCHW". Specify the data format of the input x. + * @li deformable_groups: Specify the c-axis grouping number of input x. +*@par Outputs: + * y: A Tensor. A Tensor of type float16. +*/ +REG_OP(DeformableOffsets) + .INPUT(x, TensorType({DT_FLOAT16})) + .INPUT(offsets, TensorType({DT_FLOAT16, DT_FLOAT32})) + .OUTPUT(y, TensorType({DT_FLOAT16})) + .REQUIRED_ATTR(strides, ListInt) + .REQUIRED_ATTR(pads, ListInt) + .REQUIRED_ATTR(ksize, ListInt) + .ATTR(dilations, ListInt, {0, 0, 0, 0}) + .ATTR(data_format, String, "NCHW") + .ATTR(deformable_groups, Int, 1) + .OP_END_FACTORY_REG(DeformableOffsets) + } // namespace ge -#endif // GE_OP_NN_CALCULATION_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_CALCULATION_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/nn_detect_ops.h b/third_party/fwkacllib/inc/ops/nn_detect_ops.h index bd8bb9bf..a013fb33 100644 --- a/third_party/fwkacllib/inc/ops/nn_detect_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_detect_ops.h @@ -18,8 +18,8 @@ * \file nn_detect_ops.h * \brief */ -#ifndef GE_OP_NN_DETECT_OPS_H_ -#define GE_OP_NN_DETECT_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_NN_DETECT_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_NN_DETECT_OPS_H_ #include "graph/operator_reg.h" #include "graph/operator.h" @@ -158,25 +158,18 @@ REG_OP(Iou) *@par Inputs: * Three inputs, including: *@li ydiff: A 5HD gradient input of type float32. -*@li rois: ROI position. A 2D Tensor of float32 with shape (N, 5). "N" -indicates the number of ROIs, -the value "5" indicates the indexes of images where the ROIs are located, "x0", -"x1", "y0", and "y1". -*@li rois_n: An optional input, specifying the number of valid ROIs. This -parameter is reserved . \n +*@li rois: ROI position. A 2D Tensor of float32 with shape (N, 5). "N" indicates the number of ROIs, +the value "5" indicates the indexes of images where the ROIs are located, "x0", "x1", "y0", and "y1". 
+*@li rois_n: An optional input, specifying the number of valid ROIs. This parameter is reserved . \n *@par Attributes: *@li xdiff_shape: A required list of 4 ints, obtained based on the shape of "features" of ROIAlign. *@li pooled_width: A required attribute of type int, specifying the W dimension. *@li pooled_height: A required attribute of type int, specifying the H dimension. -*@li spatial_scale: A required attribute of type float, specifying the scaling -ratio of "features" to the original image. -*@li sample_num: An optional attribute of type int, specifying the horizontal -and vertical -sampling frequency of each output. If this attribute is set to "0", the -sampling frequency is -equal to the rounded up value of "rois", which is a floating point number. -Defaults to "2" . \n +*@li spatial_scale: A required attribute of type float, specifying the scaling ratio of "features" to the original image. +*@li sample_num: An optional attribute of type int, specifying the horizontal and vertical +sampling frequency of each output. If this attribute is set to "0", the sampling frequency is +equal to the rounded up value of "rois", which is a floating point number. Defaults to "2" . \n *@par Outputs: *xdiff: Gradient added to input "features". Has the same 5HD shape as input "features". @@ -525,11 +518,11 @@ as xx...xyy...yww...whh...hbb...bc0c0..c0c1c1...c1......cncn...cn . \n *@par Outputs: *@li coord_data: A float16 or float32 with shape [N, boxes*coords, ceilx(height*width*2+32, 32)/2], -where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the coordinates of a detected box. +* where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the coordinates of a detected box. *@li obj_prob: A float16 or float32 with shape [N, ceilx(boxes*height*width *2+32, 32)/2], -where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the confidence. 
+* where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the confidence. *@li classes_prob: A float16 or float32 with shape [N, classes, ceilx(boxes*height*width *2+32, 32)/2], -where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the prediction classes . \n +* where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the prediction classes . \n *@attention Constraints: *@li This operator applies to YOLO v2 and v3 networks. @@ -557,9 +550,9 @@ REG_OP(Yolo) *@par Inputs: * Four inputs, including: *@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov3DetectionOutput. -Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. +* Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. *@li img_info: A float16 or float32, describing the image information including the required image height and width -and the actual image height and width. +* and the actual image height and width. * *@par Attributes: *@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" @@ -568,7 +561,7 @@ and the actual image height and width. *@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 20]. *@li relative: An optional bool. Defaults to and must be "true". *@li obj_threshold: A required float, specifying the confidence threshold for box filtering, -which is the output "obj" of operator Yolo). The value range is [0.0, 1.0] . \n +* which is the output "obj" of operator Yolo). The value range is [0.0, 1.0] . \n *@li post_nms_topn: An optional int32. This attribute is reserved. 
*@li score_threshold: A required float, specifying the class score threshold for box filtering, @@ -615,11 +608,11 @@ REG_OP(YoloV2DetectionOutput) *@par Inputs: *Six inputs, including: *@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov2DetectionOutput. -Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. +* Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. *@li imginfo: A float16, describing the image information including the required image height and width -and the actual image height and width. +* and the actual image height and width. *@li windex: A windex tensor with shape [height, weight]. Has the same type as the inputs. -[[0,1,2...(weight-1)],[0,1,2...(w-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)] is formed. +* [[0,1,2...(weight-1)],[0,1,2...(w-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)] is formed. *@li hindex: A hindex tensor with shape [height, weight]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]]. @@ -680,10 +673,10 @@ REG_OP(YoloV2DetectionOutputD) *@par Inputs: *Ten inputs, including: *@li Operator Yolov3DetectionOutput takes the outputs of operator Yolo as its inputs. A Yolo operator has three outputs: "coords", "obj", and "class". -There are three Yolo operators at Yolov3DetectionOutput's preceding layer on Yolo v3. For details, see the description of operator Yolo. +* There are three Yolo operators at Yolov3DetectionOutput's preceding layer on Yolo v3. For details, see the description of operator Yolo. *@li img_info: A float16 or float32, describing the image information including the required image height and width -and the actual image height and width. -* +* and the actual image height and width. 
+ *@par Attributes: *@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" *@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. @@ -698,13 +691,13 @@ and the actual image height and width. *@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0]. *@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "512". -* + *@par Outputs: *@li boxout: A tensor of type float16 or float32 with shape [batch,6*post_nms_topn], describing the information of each output box. * In output shape, 6 means x1, y1, x2, y2, score, label(class). Output by the number of box_out_num. *@li boxoutnum: A tensor of type int32 with shape [batch,8], specifying the number of output boxes. * The output shape means only the first one of the 8 numbers is valid, the number of valid boxes in each batch, the maximum number of valid boxes in each batch is 1024 -* + *@attention Constraints: *@li This operator applies only to the YOLO v3 network. *@li The preceding layer of operator Yolov3DetectionOutput must be three Yolo operators . \n @@ -746,16 +739,15 @@ REG_OP(YoloV3DetectionOutput) *@par Inputs: *16 Input, including: *@li The outputs of operator Yolo at the preceding layer (that is, three Yolo operators on YOLO v3) are used as the inputs of operator Yolov3DetectionOutput. -A Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. +* A Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. *@li imginfo: A float16, describing the image information including the required image height and width -and the actual image height and width. +* and the actual image height and width. *@li windex: A windex tensor with shape [height,weight]. 
Has the same type as the inputs. -[[0,1,2...(weight-1)],[0,1,2...(w-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)] is formed for the three Yolo outputs, respectively . \n +* [[0,1,2...(weight-1)],[0,1,2...(w-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)] is formed for the three Yolo outputs, respectively . \n *@li hindex: A hindex tensor with shape [height,weight]. Has the same type as the inputs. -[[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]] is formed for the three Yolo outputs, respectively . \n - -* +* [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]] is formed for the three Yolo outputs, respectively . \n +* *@par Attributes: *@li biases: A required float32. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" *@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. @@ -767,13 +759,13 @@ and the actual image height and width. *@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo). The value range is [0.0, 1.0]. *@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0]. *@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "512". -* + *@par Outputs: *@li boxout: A tensor of type float16 or float32 with shape [batch,6*post_nms_topn], describing the information of each output box. * In output shape, 6 means x1, y1, x2, y2, score, label(class). Output by the number of box_out_num. *@li boxoutnum: A tensor of type int32 with shape [batch,8], specifying the number of output boxes. 
* The output shape means only the first one of the 8 numbers is valid, the number of valid boxes in each batch, the maximum number of valid boxes in each batch is 1024 -* + *@attention Constraints: *@li This operator applies only to the YOLO v3 network. *@li The preceding layer of operator Yolov3DetectionOutput must be three Yolo operators. @@ -824,8 +816,8 @@ REG_OP(YoloV3DetectionOutputD) *@li Operator Yolov3DetectionOutput takes the outputs of operator Yolo as its inputs. A Yolo operator has three outputs: "coords", "obj", and "class". \n There are three Yolo operators at Yolov3DetectionOutput's preceding layer on Yolo v3. For details, see the description of operator Yolo. *@li img_info: A float16 or float32, describing the image information including the required image height and width \n -and the actual image height and width. -* +* and the actual image height and width. + *@par Attributes: *@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" *@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. @@ -840,13 +832,13 @@ and the actual image height and width. *@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n *@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "512". -* + *@par Outputs: *@li boxout: A tensor of type float16 or float32 with shape [batch,6,post_nms_topn](out_box_dim == 3) or [batch, 6*post_nms_topn](out_box_dim == 2), * In output shape, 6 means x1, y1, x2, y2, score, label(class). Output by the number of box_out_num. *@li boxoutnum: A tensor of type int32 with shape [batch,8], specifying the number of output boxes. 
* The output shape means only the first one of the 8 numbers is valid, the number of valid boxes in each batch, the maximum number of valid boxes in each batch is 1024 -* + *@attention Constraints:\n *@li This operator applies only to the YOLO v3 network. *@li The preceding layer of operator Yolov3DetectionOutput must be three Yolo operators. @@ -875,19 +867,19 @@ REG_OP(YoloV3DetectionOutputV2) .OP_END_FACTORY_REG(YoloV3DetectionOutputV2) /** -*@brief Performs YOLO V3 detection . \n +*@brief Performs YOLO V3 detection. *@par Inputs: *16 Input, including: *@li The outputs of operator Yolo at the preceding layer (that is, three Yolo operators on YOLO v3) are used as the inputs of operator Yolov3DetectionOutput. -A Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. +* A Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. *@li imginfo: A float16, describing the image information including the required image height and width -and the actual image height and width. -*@li windex: A windex tensor with shape [height,weight]. Has the same type as the inputs. [[0,1,2...(weight-1)],[0,1,2...(w-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)] is formed for the three Yolo outputs, respectively . \n +* and the actual image height and width. +*@li windex: A windex tensor with shape [height,weight]. Has the same type as the inputs. +* [[0,1,2...(weight-1)],[0,1,2...(w-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)] +* is formed for the three Yolo outputs, respectively .It's a dynamic input. \n *@li hindex: A hindex tensor with shape [height,weight]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]] is formed for the three Yolo outputs, respectively . \n - -* *@par Attributes: *@li biases: A required float32. 
"biases = Number of Yolo operators at the preceding layer x 2 x boxes" *@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. @@ -902,6 +894,7 @@ and the actual image height and width. * *@par Outputs: *@li boxout: A tensor of type float16 or float32 with shape [batch,6,post_nms_topn](out_box_dim == 3) or [batch, 6*post_nms_topn](out_box_dim == 2), +* describing the information of each output box. * In output shape, 6 means x1, y1, x2, y2, score, label(class). Output by the number of box_out_num. *@li boxoutnum: A tensor of type int32 with shape [batch,8], specifying the number of output boxes. * The output shape means only the first one of the 8 numbers is valid, the number of valid boxes in each batch, the maximum number of valid boxes in each batch is 1024 @@ -912,6 +905,9 @@ and the actual image height and width. *@see Yolo() *@par Third-party framework compatibility * It is a custom operator. It has no corresponding operator in Caffe. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use YoloV3DetectionOutputV2 instead. */ REG_OP(YoloV3DetectionOutputV2D) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) @@ -1028,15 +1024,15 @@ REG_OP(ROIPooling) /** *@brief Computes decode bbox function. -* + *@par Inputs: *Inputs include: * @li box_predictions: A Tensor. Must be float16. * @li anchors: A Tensor. Must have the same type as box_predictions. -* + *@par Attributes: * @ decode_clip: required, float, threahold of decode process. -* + *@par Outputs: * @ decoded_boxes: A Tensor. Must have the same type as box_predictions. * N-D with shape [N, 4]. @@ -1207,12 +1203,12 @@ REG_OP(RpnProposalsD) /** *@brief Computes Score Filte Pre-Sort function. -* + *@par Inputs: *Inputs include: * @li rois: A Tensor. Must be float16. N-D with shape [N, 4]. * @li cls_bg_prob: A Tensor. Must be float16. N-D with shape [N, 1]. -* + *@par Attributes: * @li score_threshold: required, float, threahold of topk process. 
* @li k: required, Int, threahold of topk process. @@ -1273,12 +1269,12 @@ REG_OP(RpnProposalPostProcessing) .OP_END_FACTORY_REG(RpnProposalPostProcessing) /** *@brief Computes DecodeBoundariesTarget function. -* + *@par Inputs: *Inputs include: * @li boundary_predictions: A Tensor. Must be float16. * @li anchors: A Tensor. Must be float16. -* + *@par Outputs: * @ boundary_encoded: A Tensor. Must be float16. @@ -1476,7 +1472,21 @@ REG_OP(DecodeBboxV2) .ATTR(reversed_box, Bool, false) .OP_END_FACTORY_REG(DecodeBboxV2) - +/** +*@brief Computes sort function. +* +*@par Inputs: +*Inputs include: +* x: A Tensor. Must be float16 or float32. +* +*@par Attributes: +* @li axis: optional, int. +* @li descending: optional,bool. +* +*@par Outputs: +* @li y1: A Tensor. Must have the same type as x. +* @li y2: A Tensor. Indices of y1 in x.Dtype must be int32. +*/ REG_OP(Sort) .INPUT(x, TensorType({ DT_FLOAT16 })) .OUTPUT(y1, TensorType({ DT_FLOAT16 })) @@ -1485,7 +1495,6 @@ REG_OP(Sort) .ATTR(descending, Bool, false) .OP_END_FACTORY_REG(Sort) - } // namespace ge -#endif // GE_OP_NN_DETECT_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_DETECT_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/nn_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_norm_ops.h index 0d0032cf..35c4c7d4 100644 --- a/third_party/fwkacllib/inc/ops/nn_norm_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_norm_ops.h @@ -18,8 +18,8 @@ * \file nn_norm_ops.h * \brief */ -#ifndef GE_OP_NN_NORM_OPS_H -#define GE_OP_NN_NORM_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_NN_NORM_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_NN_NORM_OPS_H_ #include "graph/operator_reg.h" namespace ge { @@ -159,6 +159,34 @@ REG_OP(SigmoidCrossEntropyWithLogits) .OUTPUT(loss, TensorType({DT_FLOAT16, DT_FLOAT})) .OP_END_FACTORY_REG(SigmoidCrossEntropyWithLogits) +/** +*@brief Computes the sigmoid cross entropy loss of "predict" and "target" . 
\n + +*@par Inputs: +* four inputs, including: +*@li predict: A multi-dimensional Tensor of type float16 or float32, specifying the predictive value. +*@li target: A multi-dimensional Tensor of type float16 or float32, specifying the target value . \n +*@li weight: A multi-dimensional Tensor, specifying the weight value. \n +*@li pos_weight: A multi-dimensional Tensor, specifying the pos weight value. \n + +*@par Attributes: +*reduction: A character string from "none", "mean", and "sum", specifying the reduction type to be applied to the output. Defaults to "mean" . \n + +*@par Outputs: +*loss: Sigmoid cross entropy between the predictive value and target value. Has the same dimensions as "predict" . \n + +*@par Third-party framework compatibility +* Compatible with PyTorch operator BCEWithLogitsLoss. +*/ +REG_OP(SigmoidCrossEntropyWithLogitsV2) + .INPUT(predict, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(target, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(weight, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(pos_weight, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(loss, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(reduction, String, "mean") + .OP_END_FACTORY_REG(SigmoidCrossEntropyWithLogitsV2) + /** *@brief Computes the regression box of the RPN. It is a FasterRCNN operator . \n @@ -896,7 +924,29 @@ REG_OP(InstanceNormV2) .ATTR(epsilon, Float, 0.00001) .OP_END_FACTORY_REG(InstanceNormV2) +/** +*@brief Performs instance normalization for inference. + +*@par Inputs:\n +* Five inputs, including: (NC1HWC0 supported) +*@li x: A Tensor of type float16 or float32. +*@li gamma: A [N, C1, 1, 1, C0] Tensor of type float32, for the scaling gamma. +*@li beta: A [N, C1, 1, 1, C0] Tensor of type float32, for the scaling beta. +*@li mean: A [N, C1, 1, 1, C0] Tensor of type float32, for the mean. +*@li variance: A [N, C1, 1, 1, C0] Tensor of type float32, for the variance. 
+*@li variance_sqrt: A [N, C1, 1, 1, C0] Tensor of type float32, for the variance_sqrt. + +*@par Outputs:\n +*y: A Tensor of type float16 or float32 for the normalized "x". +*batch_mean: A Tensor of type float32 for the result mean. +*batch_variance: A Tensor of type float32 for the result variance. +*@attention Constraints: +*For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use INInferV2 instead. +*/ REG_OP(INInferV2D) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) .OPTIONAL_INPUT(gamma, TensorType({DT_FLOAT})) @@ -930,4 +980,4 @@ REG_OP(InHost) .OP_END_FACTORY_REG(InHost) } // namespace ge -#endif //GE_OP_NN_NORM_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_NORM_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/nn_ops.h b/third_party/fwkacllib/inc/ops/nn_ops.h index ea4a5ba3..9edc469a 100644 --- a/third_party/fwkacllib/inc/ops/nn_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_ops.h @@ -18,9 +18,9 @@ * \file nn_ops.h * \brief */ -#ifndef GE_OP_NN_OPS_H_ -#define GE_OP_NN_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_NN_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_NN_OPS_H_ #include "nn_pooling_ops.h" -#endif // GE_OP_NN_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h index fb7fc127..d72da385 100644 --- a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h @@ -18,8 +18,8 @@ * \file nn_pooling_ops.h * \brief */ -#ifndef GE_OP_NN_POOLING_OPS_H -#define GE_OP_NN_POOLING_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_NN_POOLING_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_NN_POOLING_OPS_H_ #include "graph/operator_reg.h" #include "graph/operator.h" @@ -31,7 +31,7 @@ namespace ge { *@par Inputs: *@li x: An NCHW tensor of type float16, float32, int8. 
*@par Attributes: -*@li mode: An optional int32, specifying the pooling algorithm, either "1" (max pooling) or "0" (avg pooling). Defaults to "0". +*@li mode: An optional int32, specifying the pooling algorithm, either "0" (max pooling) or "1" (avg pooling). Defaults to "0". *@li global_pooling: An optional bool. Defaults to "false". *@li window: Optional, including: *window[0]: An optional int32, specifying the window size along in the H dimension. The value range is [1, 32768]. Defaults to "1". @@ -70,6 +70,7 @@ REG_OP(Pooling) .ATTR(pad, ListInt, {0,0,0,0}) // pad size .ATTR(dilation, ListInt, {1,1,1,1}) .ATTR(ceil_mode, Int, 0) + .ATTR(data_format, String, "NCHW") .OP_END_FACTORY_REG(Pooling) /** @@ -79,7 +80,7 @@ REG_OP(Pooling) *x: A tensor of type float16, float32, double . \n *@par Attributes: -*@li ksize: A required list of 4 ints, specifying the size (N, C, H, and W) of the sliding window, where N = C = 1, and H and W are positive integers within the range [1, 32768]. +*@li ksize: A required list of 4 ints, specifying the size (N, C, H, and W) of the sliding window, where N = C = 1, and H and W are positive integers within the range [1, 255]. *@li strides: A required list of 4 ints, specifying the stride of the sliding window. The strides of the N and C dimensions are 1. The strides of the H and W dimensions are positive integers within the range [1, 63]. *@li padding: A required string, specifying the padding algorithm, either "VALID" or "SAME". With "SAME" means that the outputs will have the same spatial dimensions as its inputs. With "VALID" means no padding. *@li data_format: An optional string, specifying the data format of "ksize" and "strides", either "NCHW", "NC1HWC0", or "NHWC" (default) . \n @@ -91,7 +92,7 @@ REG_OP(Pooling) *@li This operator applies only to a TensorFlow network. *@li Only single input and single output are supported. *@li Global pooling is supported. 
-*@li "ksize_H" and "ksize_W" are positive integers within the range [1, 32768]. ksize_H * ksize_W < 256 +*@li "ksize_H" and "ksize_W" are positive integers within the range [1, 255]. ksize_H * ksize_W < 256 *@li Due to instruction restrictions, the values of "strides_h" and "strides_w" are positive integers within the range [1, 63]. *@par Third-party framework compatibility * Compatible with the TensorFlow operator AvgPool. @@ -106,10 +107,50 @@ REG_OP(AvgPool) .OP_END_FACTORY_REG(AvgPool) /** -*@brief Performs average pooling on the input . \n +*@brief Performs average pooling on the input. *@par Inputs: -*x: A 5-D Tensor of shape [batch, depth, height, width, channels] and type float16, float32, double . \n +*x: A tensor of type float16, float32, double. + +*@par Attributes: +*@li ksize: A required list of 4 ints, specifying the size (N, C, H, and W) of the sliding window, where N = C = 1, and H and W are positive integers within the range [1, 255]. +*@li strides: A required list of 4 ints, specifying the stride of the sliding window. The strides of the N and C dimensions are 1. The strides of the H and W dimensions are positive integers within the range [1, 63]. +*@li padding_mode: A required string, specifying the padding algorithm, either "VALID", "SAME" and "CALCULATED". With "SAME" means that the outputs will have the same spatial dimensions as its inputs. With "VALID" means no padding. +*@li pads: Pad value when padding_mode is "CALCULATED". +*@li data_format: An optional string, specifying the data format of "ksize" and "strides", either "NCHW", "NC1HWC0", or "NHWC" (default). +*@li global_pooling: Global or not. If true, pads will change to {0,0,0,0} and ksize will change to [input_h, input_w] +*@li ceil_mode: Use ceil or floor to calculate the output size when padding_mode is "CALCULATED". +*@li exclusive: Ignore padding area or not when calculating average. + +*@par Outputs: +*y: The average pooled output tensor. 
Has the same type and format as input "x". + +*@attention Constraints: +*@li Only single input and single output are supported. +*@li Global pooling is supported. +*@li "ksize_H" and "ksize_W" are positive integers within the range [1, 255]. ksize_H * ksize_W < 256 +*@li Due to instruction restrictions, the values of "strides_h" and "strides_w" are positive integers within the range [1, 63]. +*@par Third-party framework compatibility +* Compatible with the TensorFlow operator AvgPoolV2. +*/ +REG_OP(AvgPoolV2) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .REQUIRED_ATTR(ksize, ListInt) + .REQUIRED_ATTR(strides, ListInt) + .ATTR(padding_mode, String, "CALCULATED") + .ATTR(pads, ListInt, {0, 0, 0, 0}) + .ATTR(data_format, String, "NCHW") + .ATTR(global_pooling, Bool, false) + .ATTR(ceil_mode, Bool, false) + .ATTR(exclusive, Bool, true) + .OP_END_FACTORY_REG(AvgPoolV2) + +/** +*@brief Performs average pooling on the input. + +*@par Inputs: +*x: A 5-D Tensor of shape [batch, depth, height, width, channels] and type float16, float32, double. *@par Attributes: *@li ksize: List of ints that has length 1, 3 or 5. The size of the window for each dimension of the input tensor. @@ -185,15 +226,15 @@ REG_OP(MaxPoolExt2) *@par Inputs: * One input: *x: An NC1HWC0 Tensor. Supported type:float16, float32, double, int8, int16, - * int32, int64, uint8, uint16, qint8 +* int32, int64, uint8, uint16, qint8 *@par Attributes: *@li ksize: A required list of int8, int16, int32, or int64 values, - * specifying the size of the window for each dimension of the input tensor. - * No default value. +* specifying the size of the window for each dimension of the input tensor. +* No default value. *@li strides: A required list of int8, int16, int32, or int64 values, - * specifying the stride of the sliding window for each dimension of - * the input tensor. No default value. 
+* specifying the stride of the sliding window for each dimension of +* the input tensor. No default value. *@li padding: A required string. No default value. *@li data_format: An optional string. Defaults to "NHWC" . \n @@ -202,9 +243,9 @@ REG_OP(MaxPoolExt2) *@attention Constraints: *@li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, - * ksize[1] * ksize[2] <= 255. +* ksize[1] * ksize[2] <= 255. *@li "stride is a list that has length 4: strides[0] = 1 or strides[3] = 1, - * strides[1] <= 63, strides[0] >= 1, strides[2] <= 63, strides[2] >= 1. +* strides[1] <= 63, strides[0] >= 1, strides[2] <= 63, strides[2] >= 1. *@li "padding" is either "SAME" or "VALID". @@ -626,7 +667,7 @@ REG_OP(AvgPoolGrad) * @par Inputs: * @input_grad: An NHWC tensor of type float16. * @mean_matrix: Assist matrix, an NHWC tensor of type float16. -* @kernel_matrix: Assist matrix, an NHWC tensor of type float16. \n +* @kernel_matrix: Assist matrix, an NHWC tensor of type float16. * @par Attributes: * @li orig_input_shape: A required Original input dimensions. @@ -656,6 +697,88 @@ REG_OP(AvgPoolGradD) .ATTR(data_format, String, "NHWC") .OP_END_FACTORY_REG(AvgPoolGradD) +/** +* @brief Computes avgpoolv2grad function. + +* @par Inputs: +* @li orig_input_shape: An NHWC tensor of type int32. +* @li input_grad: An NHWC tensor of type float16, float32, or double. + +* @par Attributes: +* @li ksize: A required tuple or list, specifying the size of the window for +* each dimension of the input tensor. +* @li strides: A required tuple or list, specifying the stride of the sliding +* window for each dimension of the input tensor. +* @li padding_mode: A required string, specifying the type of +* the padding algorithm to use. +* @li global_pooling: Whether to use the global pooling. If global_pooling=true, +* ksize and pads will be ignored. Default False. +* @li ceil_mode: Whether to use the ceil function to calculate output height and +* width. Default False. 
+* @li exclusive: Whether to exclude padding points. default is true. +* @li data_format: An optional string. Defaults to "NHWC". + +* @par Outputs: +* @out_grad: A mutable tensor with the same shape and type as "orig_input". + +* @par Third-party framework compatibility +* @li Compatible with the TensorFlow operator AvgPoolGrad. +*/ +REG_OP(AvgPoolV2Grad) + .INPUT(orig_input_shape, TensorType({DT_INT32})) + .INPUT(input_grad, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) + .OUTPUT(out_grad, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE})) + .REQUIRED_ATTR(ksize, ListInt) + .REQUIRED_ATTR(strides, ListInt) + .ATTR(padding_mode, String, "CALCULATED") + .ATTR(pads, ListInt, {0,0,0,0}) + .ATTR(data_format, String, "NCHW") + .ATTR(global_pooling, Bool, false) + .ATTR(ceil_mode, Bool, false) + .ATTR(exclusive, Bool, true) + .OP_END_FACTORY_REG(AvgPoolV2Grad) +/** +* @brief Computes gradients of averagev2 pooling function. + +* @par Inputs: +* @li input_grad: An NHWC tensor of type float16, float32, or double. + +* @par Attributes: +* @li orig_input_shape: A required tuple or list of type int32. +* @li ksize: A required tuple or list, specifying the size of the window for +* each dimension of the input tensor. +* @li strides: A required tuple or list, specifying the stride of the sliding +* window for each dimension of the input tensor. +* @li padding_mode: A required string, specifying the type of +* the padding algorithm to use. +* @li global_pooling: Whether to use the global pooling. If global_pooling=true, +* ksize and pads will be ignored. Default False. +* @li ceil_mode: Whether to use the ceil function to calculate output height and +* width. Default False. +* @li exclusive: Whether to exclude padding points. default is true. +* @li data_format: An optional string. Defaults to "NHWC". + +* @par Outputs: +* @out_grad: A mutable tensor with the same shape and type as "orig_input". 
+ +* @par Third-party framework compatibility +* @li Compatible with the TensorFlow operator AvgPoolGrad. +*/ +REG_OP(AvgPoolV2GradD) + .INPUT(input_grad, TensorType({DT_FLOAT16})) + .OPTIONAL_INPUT(mean_matrix, TensorType({DT_FLOAT16})) + .OPTIONAL_INPUT(kernel_matrix, TensorType({DT_FLOAT16})) + .OUTPUT(out_grad, TensorType({DT_FLOAT16})) + .REQUIRED_ATTR(orig_input_shape, ListInt) + .REQUIRED_ATTR(ksize, ListInt) + .REQUIRED_ATTR(strides, ListInt) + .ATTR(padding_mode, String, "CALCULATED") + .ATTR(pads, ListInt, {0,0,0,0}) + .ATTR(data_format, String, "NCHW") + .ATTR(global_pooling, Bool, false) + .ATTR(ceil_mode, Bool, false) + .ATTR(exclusive, Bool, true) + .OP_END_FACTORY_REG(AvgPoolV2GradD) /** *@brief :upsample the layer @@ -1065,6 +1188,108 @@ REG_OP(MaxPoolGradWithArgmaxV2) .ATTR(dilation, ListInt, {1,1,1,1}) .ATTR(ceil_mode, Bool, false) .OP_END_FACTORY_REG(MaxPoolGradWithArgmaxV2) -} // namespace ge -#endif // GE_OP_NN_POOLING_OPS_H +/** +* @brief Performs max pooling on the input . \n + +* @par Inputs: +* One input: +* x: An NC1HWC0 Tensor. Supported type:float16, float32, double, int8, int16, +* int32, int64, uint8, uint16, qint8 + +* @par Attributes: +* @li ksize: A required list of int8, int16, int32, or int64 values, +* specifying the size of the window for each dimension of the input tensor. +* No default value. +* @li strides: A required list of int8, int16, int32, or int64 values, +* specifying the stride of the sliding window for each dimension of +* the input tensor. No default value. +* @li padding_mode: A required string. Defaults to "CALCULATED". +* @li pads:A required list of int8, int16, int32, or int64 values, +* a data to caculate when padding_mode is "SAME" and "CALCULATED". +* @li data_format: An optional string. Defaults to "NHWC" . +* @li global_pooling bool, Whether to use the global pooling. +* If global_pooling = true, kernel size and paddings will be ignored. 
+* Default False +* @li ceil_mode:global_pooling (bool) – (bool) Whether to use the global pooling. +* If global_pooling = true, kernel size and paddings will be ignored. +* Default False \n + +* @par Outputs: +* y: A Tensor. Has the same type and format as input "x" . \n + +* @attention Constraints: +* @li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, +* ksize[1] * ksize[2] <= 255. +* @li "stride is a list that has length 4: strides[0] = 1 or strides[3] = 1, +* strides[1] <= 63, strides[0] >= 1, strides[2] <= 63, strides[2] >= 1. +* @li "padding" is "SAME" "VALID" or "CALCULATED" . + + +* @par Third-party framework compatibility +* Compatible with the TensorFlow operator MaxPool. +*/ +REG_OP(MaxPoolV3) + .INPUT(x,TensorType({DT_FLOAT16, DT_FLOAT32})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT32})) + .REQUIRED_ATTR(ksize, ListInt) + .REQUIRED_ATTR(strides, ListInt) + .ATTR(padding_mode, String, "CALCULATED") + .ATTR(pads, ListInt, {0,0,0,0}) + .ATTR(data_format, String, "NCHW") + .ATTR(global_pooling,Bool,false) + .ATTR(ceil_mode, Bool, false) + .OP_END_FACTORY_REG(MaxPoolV3) + +/** +* @brief Computes gradients of the maxpooling function . \n + +* @par Inputs: +* @li orig_input: A mutable NC1HWC0 tensor of type RealNumberType. +* @li orig_output: A mutable NC1HWC0 tensor of type RealNumberType. +* @li grad: A mutable NC1HWC0 tensor of type RealNumberType . \n + +* @par Attributes: +* @li ksize: A required list of int8, int16, int32, or int64 values, +* specifying the size of the window for each dimension of the input tensor. +* No default value. +* @li strides: A required list of int8, int16, int32, or int64 values, +* specifying the stride of the sliding window for each dimension of +* the input tensor. No default value. +* @li padding_mode: A required string. Defaults to "CALCULATED". +* @li pads:A required list of int8, int16, int32, or int64 values, +* a data to calculate when padding_mode is "SAME" and "CALCULATED". 
+* @li data_format: An optional string. Defaults to "NHWC" . +* @li global_pooling bool, Whether to use the global pooling. +* If global_pooling = true, kernel size and paddings will be ignored. +* Default False +* @li ceil_mode:global_pooling (bool) – (bool) Whether to use the global pooling. +* If global_pooling = true, kernel size and paddings will be ignored. +* Default False \n + +* @par Outputs: +* y: A mutable tensor. Has the same shape and type as "x1" . \n + +* @attention Constraints: +* @li Computing gradients of global pooling is not supported, which means +* "ksize < x1". +* @li "ksize" is in the range [1, 255]. "strides" is in the range [1, 63] + +* @par Third-party framework compatibility +* Compatible with the TensorFlow operator MaxPoolGrad. +*/ +REG_OP(MaxPoolV3Grad) + .INPUT(orig_input, TensorType::RealNumberType()) + .INPUT(orig_output, TensorType::RealNumberType()) + .INPUT(grad, TensorType::RealNumberType()) + .OUTPUT(out_grad, TensorType::RealNumberType()) + .REQUIRED_ATTR(ksize, ListInt) + .REQUIRED_ATTR(strides, ListInt) + .ATTR(padding_mode, String, "CALCULATED") + .ATTR(pads, ListInt, {0, 0, 0, 0}) + .ATTR(data_format, String, "NCHW") + .ATTR(global_pooling, Bool, false) + .ATTR(ceil_mode, Bool, false) + .OP_END_FACTORY_REG(MaxPoolV3Grad) +} // namespace ge +#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_POOLING_OPS_H diff --git a/third_party/fwkacllib/inc/ops/nn_training_ops.h b/third_party/fwkacllib/inc/ops/nn_training_ops.h index 0621a96c..047fd6da 100644 --- a/third_party/fwkacllib/inc/ops/nn_training_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_training_ops.h @@ -18,8 +18,8 @@ * \file nn_training_ops.h * \brief */ -#ifndef GE_OP_TRAINING_OPS_H -#define GE_OP_TRAINING_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_NN_TRAINING_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_NN_TRAINING_OPS_H_ #include "graph/operator_reg.h" namespace ge { @@ -111,6 +111,9 @@ REG_OP(ApplyAdaMax) * *@par Third-party framework compatibility *Compatible with the TensorFlow 
operator ApplyAdaMax. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdaMax instead. */ REG_OP(ApplyAdaMaxD) .INPUT(var, TensorType::NumberType()) @@ -349,6 +352,9 @@ REG_OP(ApplyMomentum) * accum: A mutable tensor. Has the same type as input "accum". *@par Third-party framework compatibility *Compatible with the TensorFlow operator ApplyMomentum. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyMomentum instead. */ REG_OP(ApplyMomentumD) @@ -675,6 +681,9 @@ REG_OP(ApplyPowerSign) * *@par Third-party framework compatibility *Compatible with the TensorFlow operator ApplyPowerSign. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyPowerSign instead. */ REG_OP(ApplyPowerSignD) .INPUT(var, TensorType::NumberType()) @@ -795,6 +804,9 @@ REG_OP(ApplyAddSign) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ApplyAddSign. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAddSign instead. */ REG_OP(ApplyAddSignD) .INPUT(var, TensorType::NumberType()) @@ -916,6 +928,9 @@ REG_OP(ApplyCenteredRMSProp) *@par Third-party framework compatibility *Compatible with the TensorFlow operator ApplyCenteredRMSPropD. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyCenteredRMSProp instead. */ REG_OP(ApplyCenteredRMSPropD) .INPUT(var, TensorType::NumberType()) @@ -1034,6 +1049,9 @@ REG_OP(ApplyAdagrad) * *@par Third-party framework compatibility *Compatible with the TensorFlow operator ApplyAdagrad. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdagrad instead. */ REG_OP(ApplyAdagradD) .INPUT(var, TensorType::NumberType()) @@ -1218,6 +1236,9 @@ REG_OP(ApplyAdagradDA) *@par Third-party framework compatibility *Compatible with the TensorFlow operator ApplyAdagradDA. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. 
Please use ApplyAdagradDA instead. */ REG_OP(ApplyAdagradDAD) .INPUT(var, TensorType::NumberType()) @@ -1475,6 +1496,9 @@ REG_OP(ApplyProximalAdagrad) *@par Third-party framework compatibility *Compatible with the TensorFlow operator ApplyProximalAdagradD. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyProximalAdagrad instead. */ REG_OP(ApplyProximalAdagradD) .INPUT(var, TensorType::NumberType()) @@ -1568,6 +1592,9 @@ REG_OP(SparseApplyProximalAdagrad) *@par Third-party framework compatibility *Compatible with the TensorFlow operator SparseApplyProximalAdagrad. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyProximalAdagrad instead. */ REG_OP(SparseApplyProximalAdagradD) .INPUT(var, TensorType::NumberType()) @@ -1654,6 +1681,9 @@ REG_OP(ApplyFtrl) *@par Third-party framework compatibility *Compatible with the TensorFlow operator ApplyFtrl. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyFtrl instead. */ REG_OP(ApplyFtrlD) .INPUT(var, TensorType::NumberType()) @@ -1745,6 +1775,9 @@ REG_OP(ApplyFtrlV2) *@par Third-party framework compatibility *Compatible with the TensorFlow operator ApplyFtrlV2. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyFtrlV2 instead. */ REG_OP(ApplyFtrlV2D) .INPUT(var, TensorType::NumberType()) @@ -1857,6 +1890,9 @@ REG_OP(ApplyAdam) *@par Third-party framework compatibility *Compatible with the TensorFlow operator ApplyAdam. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdam instead. */ REG_OP(ApplyAdamD) .INPUT(var, TensorType::NumberType()) @@ -1945,6 +1981,9 @@ REG_OP(ApplyAdadelta) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ApplyAdadelta. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdadelta instead. 
*/ REG_OP(ApplyAdadeltaD) .INPUT(var, TensorType::NumberType()) @@ -2556,4 +2595,4 @@ REG_OP(AtomicAddrClean) .OP_END_FACTORY_REG(AtomicAddrClean) } // namespace ge -#endif // GE_OP_TRAINING_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_TRAINING_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/no_op.h b/third_party/fwkacllib/inc/ops/no_op.h index 503d97b1..7834591c 100644 --- a/third_party/fwkacllib/inc/ops/no_op.h +++ b/third_party/fwkacllib/inc/ops/no_op.h @@ -18,8 +18,8 @@ * \file no_op.h * \brief */ -#ifndef GE_NO_OP_H_ -#define GE_NO_OP_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_NO_OP_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_NO_OP_H_ #include "graph/operator_reg.h" #include "graph/operator.h" @@ -38,4 +38,4 @@ REG_OP(NoOp) } // namespace ge -#endif // GE_NO_OP_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_NO_OP_H_ diff --git a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h index b50b7cd1..e0e5dfc6 100644 --- a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h +++ b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h @@ -18,8 +18,8 @@ * \file nonlinear_fuc_ops.h * \brief */ -#ifndef GE_OP_NONLINEAR_FUC_OPS_H -#define GE_OP_NONLINEAR_FUC_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_NONLINEAR_FUC_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_NONLINEAR_FUC_OPS_H_ #include "graph/operator_reg.h" @@ -642,4 +642,4 @@ REG_OP(Mish) } // namespace ge -#endif // GE_OP_NONLINEAR_FUC_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_NONLINEAR_FUC_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h b/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h index e94dafa7..8d7ef9f9 100644 --- a/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h +++ b/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h @@ -18,9 +18,8 @@ * \file npu_loss_scale_ops.h * \brief */ - -#ifndef GE_OP_NN_LOSS_SCALE_OPS_H -#define GE_OP_NN_LOSS_SCALE_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_NPU_LOSS_SCALE_OPS_H_ +#define 
OPS_BUILT_IN_OP_PROTO_INC_NPU_LOSS_SCALE_OPS_H_ #include "graph/operator_reg.h" namespace ge { @@ -120,4 +119,4 @@ REG_OP(NPUGetFloatStatus) .OP_END_FACTORY_REG(NPUGetFloatStatus) } // namespace ge -#endif // GE_OP_NN_LOSS_SCALE_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_NPU_LOSS_SCALE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/outfeed_ops.h b/third_party/fwkacllib/inc/ops/outfeed_ops.h index 139e4880..e0b783bc 100644 --- a/third_party/fwkacllib/inc/ops/outfeed_ops.h +++ b/third_party/fwkacllib/inc/ops/outfeed_ops.h @@ -18,10 +18,10 @@ * \file outfeed_ops.h * \brief */ -#ifndef GE_OP_OUTFEED_OPS_H -#define GE_OP_OUTFEED_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_OUTFEED_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_OUTFEED_OPS_H_ #include "data_flow_ops.h" -#endif // GE_OP_OUTFEED_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_OUTFEED_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/pad_ops.h b/third_party/fwkacllib/inc/ops/pad_ops.h index 567bc63d..e1c2a20d 100644 --- a/third_party/fwkacllib/inc/ops/pad_ops.h +++ b/third_party/fwkacllib/inc/ops/pad_ops.h @@ -18,8 +18,8 @@ * \file pad_ops.h * \brief */ -#ifndef GE_OP_PAD_OPS_H -#define GE_OP_PAD_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_PAD_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_PAD_OPS_H_ #include "graph/operator_reg.h" namespace ge { @@ -65,6 +65,9 @@ REG_OP(Fill) * *@par Outputs: * y: A tensor. Has the same type as "value". +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Fill instead. */ REG_OP(FillD) .INPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, @@ -122,6 +125,9 @@ REG_OP(BroadcastTo) * *@par Third-party framework compatibility *Compatible with the TensorFlow operator BroadcastTo. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use BroadcastTo instead. */ REG_OP(BroadcastToD) .INPUT(x, TensorType::BasicType()) @@ -169,6 +175,9 @@ REG_OP(Pad) *@par Third-party framework compatibility: * Compatible with TensorFlow operator Pad. 
+* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Pad instead. */ REG_OP(PadD) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_FLOAT})) @@ -176,6 +185,60 @@ REG_OP(PadD) .REQUIRED_ATTR(paddings, ListListInt) .OP_END_FACTORY_REG(PadD) +/** +*@brief Pads a tensor . \n + +*@par Inputs: +*Three inputs, including: +* @li x: A Tensor. Must be one of the following types: float16, float32, double, int32, +* uint8, int16, int8, complex64, int64, qint8, quint8, qint32, qint16, quint16, uint16, +* complex128, uint32, uint64. +* @li constant_values: A Tensor. Must have the same type as input. +* @li paddings: A Tensor of type int32 or int64 . \n + +*@par Outputs: +*y: A Tensor of the same type as "x" . \n + +*@par Third-party framework compatibility: +* Compatible with TensorFlow operator Pad. +*/ +REG_OP(PadV2) + .INPUT(x, TensorType::BasicType()) + .INPUT(paddings, TensorType::IndexNumberType()) + .INPUT(constant_values, TensorType::BasicType()) + .OUTPUT(y, TensorType::BasicType()) + .OP_END_FACTORY_REG(PadV2) + +/** +*@brief Pads a tensor . \n + +*@par Inputs: +*x: A Tensor. Must be one of the following types: float16, float32, int8, uint8, int32 . \n +*constant_values: A Tensor. Must have the same type as input. + +*@par Attributes: +*paddings: An optional "vector>". Defaults to "{}". +* For each dimension D of input, paddings[D, 0] indicates how many +* values to add before the contents of tensor in that dimension, +* and paddings[D, 1] indicates how many values to add after the +* contents of tensor in that dimension . \n + +*@par Outputs: +*y: A Tensor of the same type as "x" . \n + +*@par Third-party framework compatibility: +* Compatible with TensorFlow operator Pad. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Pad instead. 
+*/ +REG_OP(PadV2D) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32})) + .INPUT(constant_values, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32})) + .REQUIRED_ATTR(paddings, ListListInt) + .OP_END_FACTORY_REG(PadV2D) + /** *@brief Pads a tensor. @@ -233,6 +296,9 @@ REG_OP(PadV3) *@par Third-party framework compatibility: * Compatible with ONNX operator Pad. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use PadV3 instead. */ REG_OP(PadV3D) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8})) @@ -260,6 +326,9 @@ REG_OP(PadV3D) *@see Diag() *@par Third-party framework compatibility * Compatible with the TensorFlow operator Diag. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Diag instead. */ REG_OP(DiagD) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32})) @@ -328,7 +397,7 @@ REG_OP(AscendPadding) */ REG_OP(EmbeddingRankId) .INPUT(addr_table, TensorType({DT_UINT64})) - .INPUT(index, TensorType({DT_UINT32})) + .INPUT(index, TensorType({DT_INT64,DT_INT32,DT_UINT64})) .OUTPUT(rank_id, TensorType({DT_UINT64})) .ATTR(row_memory, Int, 320) .ATTR(mode, String, "mod") @@ -336,4 +405,4 @@ REG_OP(EmbeddingRankId) } // namespace ge -#endif //GE_OP_PAD_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_PAD_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/parsing_ops.h b/third_party/fwkacllib/inc/ops/parsing_ops.h index b3c50654..5c7adfd8 100644 --- a/third_party/fwkacllib/inc/ops/parsing_ops.h +++ b/third_party/fwkacllib/inc/ops/parsing_ops.h @@ -18,8 +18,8 @@ * \file parsing_ops.h * \brief */ -#ifndef GE_OP_PARSING_OPS_H -#define GE_OP_PARSING_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_PARSING_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_PARSING_OPS_H_ #include "graph/operator_reg.h" #include "graph/operator.h" @@ -53,4 +53,4 @@ REG_OP(StringToNumber) } // namespace ge -#endif // GE_OP_PARSING_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_PARSING_OPS_H_ diff 
--git a/third_party/fwkacllib/inc/ops/quantize_ops.h b/third_party/fwkacllib/inc/ops/quantize_ops.h index 31ba266b..b53cfeb6 100644 --- a/third_party/fwkacllib/inc/ops/quantize_ops.h +++ b/third_party/fwkacllib/inc/ops/quantize_ops.h @@ -18,8 +18,8 @@ * \file quantize_ops.h * \brief */ -#ifndef GE_OP_QUANTIZE_OPS_H -#define GE_OP_QUANTIZE_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_QUANTIZE_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_QUANTIZE_OPS_H_ #include "graph/operator_reg.h" namespace ge { @@ -221,4 +221,4 @@ REG_OP(AscendRequantS16) } // namespace ge -#endif // GE_OP_QUANTIZE_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_QUANTIZE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/ragged_array_ops.h b/third_party/fwkacllib/inc/ops/ragged_array_ops.h index 4c62ec86..9b31aa8e 100644 --- a/third_party/fwkacllib/inc/ops/ragged_array_ops.h +++ b/third_party/fwkacllib/inc/ops/ragged_array_ops.h @@ -18,8 +18,8 @@ * \file ragged_array_ops.h * \brief */ -#ifndef GE_OP_RAGGED_ARRAY_OPS_H -#define GE_OP_RAGGED_ARRAY_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_RAGGED_ARRAY_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_RAGGED_ARRAY_OPS_H_ #include "graph/operator.h" #include "graph/operator_reg.h" @@ -62,4 +62,4 @@ REG_OP(RaggedGather) } // namespace ge -#endif //GE_OP_RAGGED_ARRAY_OPS_H \ No newline at end of file +#endif // OPS_BUILT_IN_OP_PROTO_INC_RAGGED_ARRAY_OPS_H_ \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h index ec88c618..13488a25 100644 --- a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h +++ b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h @@ -18,8 +18,8 @@ * \file ragged_conversion_ops.h * \brief */ -#ifndef GE_OP_RAGGED_CONVERSION_OPS_H -#define GE_OP_RAGGED_CONVERSION_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_RAGGED_CONVERSION_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_RAGGED_CONVERSION_OPS_H_ #include "graph/operator_reg.h" namespace ge { @@ -30,7 +30,7 @@ 
namespace ge { *@par Inputs: *Two inputs, including: *@li rt_nested_splits: A list of at least 1 Tensor objects with the same type -in: int32, int64. The row_splits for the RaggedTensor. +in: int32, int64. The row_splits for the RaggedTensor. It's a dynamic input. *@li rt_dense_values: A Tensor. The flat_values for the RaggedTensor Must be one of the following types: bool, int8, int16, uint16, int32, int64, double, float, float16 . \n @@ -66,7 +66,7 @@ REG_OP(RaggedTensorToSparse) *@li values:A 1D tensor representing the values of the ragged tensor. *@li default_value:A `Tensor`. Must have the same type as `values`. *@li row_partition_tensors:A list of at least 1 `Tensor` objects with the same -type in: `int64`, `int32` .\n +type in: `int64`, `int32` . It's a dynamic input.\n *@par Attributes: *@li num_row_partition_tensors:Numbers of row partition tensors. @@ -95,4 +95,4 @@ REG_OP(RaggedTensorToTensor) } // namespace ge -#endif // GE_OP_RAGGED_CONVERSION_OPS_H \ No newline at end of file +#endif // OPS_BUILT_IN_OP_PROTO_INC_RAGGED_CONVERSION_OPS_H_ \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/ragged_math_ops.h b/third_party/fwkacllib/inc/ops/ragged_math_ops.h index ab871b7e..8af4f867 100644 --- a/third_party/fwkacllib/inc/ops/ragged_math_ops.h +++ b/third_party/fwkacllib/inc/ops/ragged_math_ops.h @@ -18,8 +18,8 @@ * \file ragged_math_ops.h * \brief */ -#ifndef GE_OP_RAGGED_MATH_OPS_H -#define GE_OP_RAGGED_MATH_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_RAGGED_MATH_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_RAGGED_MATH_OPS_H_ #include "graph/operator.h" #include "graph/operator_reg.h" @@ -57,4 +57,4 @@ REG_OP(RaggedRange) } // namespace ge -#endif //GE_OP_RAGGED_MATH_OPS_H \ No newline at end of file +#endif // OPS_BUILT_IN_OP_PROTO_INC_RAGGED_MATH_OPS_H_ \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/random_ops.h b/third_party/fwkacllib/inc/ops/random_ops.h index 24a9edd1..b46da435 100644 --- 
a/third_party/fwkacllib/inc/ops/random_ops.h +++ b/third_party/fwkacllib/inc/ops/random_ops.h @@ -18,8 +18,8 @@ * \file random_ops.h * \brief */ -#ifndef GE_OP_RANDOM_OPS_H_ -#define GE_OP_RANDOM_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_RANDOM_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_RANDOM_OPS_H_ #include @@ -374,6 +374,9 @@ REG_OP(DropOutGenMask) *@par Third-party framework compatibility * Compatible with the TensorFlow operator lin_space. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use LinSpace instead. */ REG_OP(LinSpaceD) .INPUT(assist, TensorType({DT_FLOAT})) @@ -408,6 +411,25 @@ REG_OP(LinSpace) .OUTPUT(output, TensorType({DT_FLOAT, DT_DOUBLE})) .OP_END_FACTORY_REG(LinSpace) + + +/** +*@brief The dropout operator randomly sets (according to the given dropout probability) +*the outputs of some units to zero, while others are remain unchanged. . \n + +*@par Inputs: +*One input, including: +*@li x:The input tensor variable. The data type is float32. \n + +*@par Attributes: +*@li dropout_ratio:Float between 0 and 1. Fraction of the input units to drop.Defaults to "0.5". +*@li scale_train: Bool,default to true. +*@li alpha: An optional float32. A scaling factor. Defaults to "1.0". +*@li beta: An optional float32. An exponent. Defaults to "0.0". \n + +*@par Outputs: +*y: A Variable holding Tensor representing the dropout, has same shape and data type with x. 
\n +*/ REG_OP(Dropout) .INPUT(x, TensorType{DT_FLOAT}) .OUTPUT(y, TensorType{DT_FLOAT}) @@ -475,4 +497,4 @@ REG_OP(ShuffleChannel) .OP_END_FACTORY_REG(ShuffleChannel) } // namespace ge -#endif // GE_OP_RANDOM_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_RANDOM_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/reduce_ops.h b/third_party/fwkacllib/inc/ops/reduce_ops.h index 80169344..6f44093e 100644 --- a/third_party/fwkacllib/inc/ops/reduce_ops.h +++ b/third_party/fwkacllib/inc/ops/reduce_ops.h @@ -18,8 +18,8 @@ * \file reduce_ops.h * \brief */ -#ifndef GE_OP_REDUCE_OPS_H -#define GE_OP_REDUCE_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_REDUCE_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_REDUCE_OPS_H_ #include "graph/operator_reg.h" @@ -353,6 +353,9 @@ REG_OP(ReduceSum) *@par Third-party framework compatibility * Compatible with the TensorFlow operator Sum. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceSum instead. */ REG_OP(ReduceSumD) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) @@ -378,6 +381,9 @@ REG_OP(ReduceSumD) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ReduceAll. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceAll instead. */ REG_OP(ReduceAllD) .INPUT(x, TensorType({DT_BOOL})) @@ -453,6 +459,9 @@ REG_OP(ReduceProd) *@par Third-party framework compatibility * Compatible with the TensorFlow operator ReduceProd. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceProd instead. */ REG_OP(ReduceProdD) .INPUT(x,TensorType({DT_FLOAT, DT_UINT8, DT_INT8, DT_INT32, DT_FLOAT16})) @@ -507,6 +516,9 @@ REG_OP(ReduceMean) *@par Third-party framework compatibility: * Compatible with the TensorFlow operator ReduceMean. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceMean instead. 
*/ REG_OP(ReduceMeanD) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) @@ -561,6 +573,9 @@ REG_OP(ReduceMax) *@par Third-party framework compatibility * Compatible with TensorFlow operator Max. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceMax instead. */ REG_OP(ReduceMaxD) .INPUT(x, TensorType({DT_FLOAT, DT_UINT8, DT_INT8, @@ -615,6 +630,9 @@ REG_OP(ReduceMin) *@par Third-party framework compatibility * Compatible with the TensorFlow operator reduce_min. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceMin instead. */ REG_OP(ReduceMinD) .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8})) @@ -681,6 +699,9 @@ REG_OP(ReduceAny) * *@par Third-party framework compatibility *Compatible with the TensorFlow operator reduce_any. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceAny instead. */ REG_OP(ReduceAnyD) .INPUT(x, TensorType({DT_BOOL})) @@ -766,6 +787,9 @@ REG_OP(EuclideanNorm) *@par Third-party framework compatibility * Compatible with the TensorFlow operator EuclideanNorm. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use EuclideanNorm instead. 
*/ REG_OP(EuclideanNormD) .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_FLOAT16})) @@ -960,4 +984,4 @@ REG_OP(GNTrainingUpdate) } //namespace ge -#endif /* GE_OP_REDUCE_OPS_H */ +#endif // OPS_BUILT_IN_OP_PROTO_INC_REDUCE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/resource_variable_ops.h b/third_party/fwkacllib/inc/ops/resource_variable_ops.h index fdc76391..1b60d42a 100644 --- a/third_party/fwkacllib/inc/ops/resource_variable_ops.h +++ b/third_party/fwkacllib/inc/ops/resource_variable_ops.h @@ -18,8 +18,8 @@ * \file resource_variable_ops.h * \brief */ -#ifndef GE_OP_RESOURCE_VARIABLE_OPS_H -#define GE_OP_RESOURCE_VARIABLE_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_RESOURCE_VARIABLE_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_RESOURCE_VARIABLE_OPS_H_ #include "graph/operator.h" #include "graph/operator_reg.h" @@ -111,4 +111,4 @@ REG_OP(AssignSubVariableOp) } // namespace ge -#endif //GE_OP_RESOURCE_VARIABLE_OPS_H \ No newline at end of file +#endif // OPS_BUILT_IN_OP_PROTO_INC_RESOURCE_VARIABLE_OPS_H_ \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/rnn.h b/third_party/fwkacllib/inc/ops/rnn.h index 0766d2c6..84723872 100644 --- a/third_party/fwkacllib/inc/ops/rnn.h +++ b/third_party/fwkacllib/inc/ops/rnn.h @@ -18,8 +18,8 @@ * \file rnn.h * \brief */ -#ifndef GE_OP_RNN_H -#define GE_OP_RNN_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_RNN_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_RNN_H_ #include "graph/operator_reg.h" @@ -92,7 +92,6 @@ REG_OP(DynamicLSTM) .OUTPUT(output_h, TensorType({DT_FLOAT32})) .OP_END_FACTORY_REG(DynamicLSTM) - /** *@brief: DynamicRNNGrad calculation. *@par Inputs: @@ -113,8 +112,8 @@ REG_OP(DynamicLSTM) *@li f:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. *@li o:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. *@li tanhct:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. 
-*@li seq_length:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. -*@li mask:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li seq_length:A 1D Tensor. Must be one of the following types: int32. +*@li mask:A 1D Tensor. Must be one of the following types: int8. *@li wci:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. *@li wcf:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. *@li wco:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. @@ -127,7 +126,7 @@ REG_OP(DynamicLSTM) *@li keep_prob:An float identifying the keep prob in the op. Default to 1. *@li cell_clip:An float identifying the cell clip in the op. Default to -1. *@li num_proj:An integer identifying the num projection in the op. Default to 0. -*@li time_major:An bool identifying the time major in the op. Default to true. +*@li time_major:An bool identifying the time major in the op. Default to false. *@li activation:An string identifying the type of activation function in the op. Default to "tanh". Only tanh is currently supported. *@li forget_bias:An float identifying the forget bias in the op. Default to 0. *@li is_training:An bool identifying is training in the op. Default to true. @@ -592,6 +591,116 @@ REG_OP(DynamicGRUV2) .ATTR(reset_after, Bool, true) .ATTR(is_training, Bool, true) .OP_END_FACTORY_REG(DynamicGRUV2) + +/** +*@brief: DynamicGRUV2Grad calculation. +*@par Inputs: +*fourteen inputs: \n +*@li x:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li weight_input:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li weight_hidden:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li y:A 4D Tensor. 
Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li init_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li dy:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li dh:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li update:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li reset:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li new:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li hidden_new:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li seq_length:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li mask:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. + +*@par Attributes: +*@li direction:An string identifying the direction in the op. Default to "UNIDIRECTIONAL". Only UNIDIRECTIONAL is currently supported. +*@li cell_depth:An integer identifying the cell depth in the op. Default to 1. +*@li keep_prob:An float identifying the keep prob in the op. Default to 1. +*@li cell_clip:An float identifying the cell clip in the op. Default to -1. +*@li num_proj:An integer identifying the num projection in the op. Default to 0. +*@li time_major:An bool identifying the time major in the op. Default to true. +*@li bias_type:An string identifying the type of bias_type function in the op. Default to "double_bias". +*@li gate_order:An string identifying the gate order in weight and bias. Default to "zrh". "rzh" is another option. 
+*@li reset_after:An bool identifying whether to apply reset gate after matrix multiplication. Default to true. + +*@par Outputs: +*six outputs: \n +*@li dw_input:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li dw_hidden:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li db_input:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li db_hidden:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li dx:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li dh_prev:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*/ +REG_OP(DynamicGRUV2Grad) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(weight_input, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(weight_hidden, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(init_h, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(h, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(dy, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(dh, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(update, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(reset, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(new, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(hidden_new, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(seq_length, TensorType({DT_INT32})) + .OPTIONAL_INPUT(mask, TensorType({DT_UINT8})) + .OUTPUT(dw_input, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(dw_hidden, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(db_input, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(db_hidden, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(dx, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(dh_prev, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(direction, String, "UNIDIRECTIONAL") + .ATTR(cell_depth, Int, 0) + 
.ATTR(keep_prob, Float, -1.0) + .ATTR(cell_clip, Float, -1.0) + .ATTR(num_proj, Int, 0) + .ATTR(time_major, Bool, true) + .ATTR(bias_type, String, "double_bias") + .ATTR(gate_order, String, "zrh") + .ATTR(reset_after, Bool, true) + .OP_END_FACTORY_REG(DynamicGRUV2Grad) + +/** +*@brief: GRUV2HiddenGrad calculation. +*@par Inputs: +*nine inputs: \n +*@li weight_hidden:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li init_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li dy:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li dh:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li update:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li reset:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li new:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li hidden_new:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. + +*@par Attributes: +*@li gate_order:An string identifying the gate order in weight and bias. Default to "zrh". "rzh" is another option. + +*@par Outputs: +*three outputs: \n +*@li dh_prev:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li dgate_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li dnt_x:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. 
+*/ +REG_OP(GRUV2HiddenGrad) + .INPUT(weight_hidden, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(init_h, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(h, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(dy, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(dh, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(update, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(reset, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(new, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(hidden_new, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(dh_prev, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(dgate_h, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(dnt_x, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(gate_order, String, "zrh") + .OP_END_FACTORY_REG(GRUV2HiddenGrad) } // namespace ge -#endif // GE_OP_RNN_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_RNN_H_ diff --git a/third_party/fwkacllib/inc/ops/rpn_ops.h b/third_party/fwkacllib/inc/ops/rpn_ops.h index 39583293..b7649a44 100644 --- a/third_party/fwkacllib/inc/ops/rpn_ops.h +++ b/third_party/fwkacllib/inc/ops/rpn_ops.h @@ -18,8 +18,8 @@ * \file rpn_ops.h * \brief */ -#ifndef GE_OP_RPN_OPS_H -#define GE_OP_RPN_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_RPN_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_RPN_OPS_H_ #include "graph/operator_reg.h" namespace ge { @@ -58,4 +58,4 @@ REG_OP(NMSWithMask) .OP_END_FACTORY_REG(NMSWithMask) } // namespace ge -#endif // GE_OP_TRAINING_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_RPN_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/save_ops.h b/third_party/fwkacllib/inc/ops/save_ops.h index 7fd853d3..0ce473b7 100644 --- a/third_party/fwkacllib/inc/ops/save_ops.h +++ b/third_party/fwkacllib/inc/ops/save_ops.h @@ -18,8 +18,8 @@ * \file save_ops.h * \brief */ -#ifndef GE_OP_SAVE_OPS_H_ -#define GE_OP_SAVE_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_SAVE_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_SAVE_OPS_H_ #include "graph/operator_reg.h" @@ -28,7 +28,7 @@ namespace ge { /** *@brief Mark which tensors need to be saved to the ckpt file. 
*@par Inputs: -*tensors: A list of input tensor. +*tensors: A list of input tensor.It's a dynamic input. *@par Restrictions: *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ @@ -39,4 +39,4 @@ REG_OP(Save) } // namespace ge -#endif // GE_OP_SAVE_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_SAVE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/sdca_ops.h b/third_party/fwkacllib/inc/ops/sdca_ops.h index acf1c34d..cbd9839d 100644 --- a/third_party/fwkacllib/inc/ops/sdca_ops.h +++ b/third_party/fwkacllib/inc/ops/sdca_ops.h @@ -18,8 +18,8 @@ * \file sdca_ops.h * \brief */ -#ifndef GE_OP_SDCA_OPS_H -#define GE_OP_SDCA_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_SDCA_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_SDCA_OPS_H_ #include "graph/operator.h" #include "graph/operator_reg.h" @@ -35,16 +35,16 @@ namespace ge { *rate . \n *@par Inputs: -*@li sparse_example_indices: a list of vectors which contain example indices. -*@li sparse_feature_indices: a list of vectors which contain feature indices. -*@li sparse_feature_values: a list of vectors which contains feature value associated with each feature group. -*@li dense_features: a list of matrices which contains the dense feature values. +*@li sparse_example_indices: a list of vectors which contain example indices.It's a dynamic input. +*@li sparse_feature_indices: a list of vectors which contain feature indices.It's a dynamic input. +*@li sparse_feature_values: a list of vectors which contains feature value associated with each feature group.It's a dynamic input. +*@li dense_features: a list of matrices which contains the dense feature values.It's a dynamic input. *@li example_weights: a vector which contains the weight associated with each example. *@li example_labels: a vector which contains the label/target associated with each example. *@li sparse_indices: a list of vectors where each value is the indices which has -*corresponding weights in sparse_weights. This field maybe omitted for the dense approach. 
+*corresponding weights in sparse_weights. This field maybe omitted for the dense approach.It's a dynamic input. *@li sparse_weights: a list of vectors where each value is the weight associated with a sparse feature group. -*@li dense_weights: a list of vectors where the values are the weights associated with a dense feature group. +*@li dense_weights: a list of vectors where the values are the weights associated with a dense feature group.It's a dynamic input. *@li example_state_data: a list of vectors containing the example state data. *@li loss_type: Type of the primal loss. Currently SdcaSolver supports logistic, squared and hinge losses. *@li l1: Symmetric l1 regularization strength. @@ -61,6 +61,7 @@ namespace ge { *@par Third-party framework compatibility * Compatible with tensorflow SdcaOptimizerV2 operator. */ + REG_OP(SdcaOptimizerV2) .DYNAMIC_INPUT(sparse_example_indices, TensorType({DT_INT64})) .DYNAMIC_INPUT(sparse_feature_indices, TensorType({DT_INT64})) @@ -88,4 +89,4 @@ REG_OP(SdcaOptimizerV2) } // namespace ge -#endif //GE_OP_SDCA_OPS_H \ No newline at end of file +#endif // OPS_BUILT_IN_OP_PROTO_INC_SDCA_OPS_H_ \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/selection_ops.h b/third_party/fwkacllib/inc/ops/selection_ops.h index 8ef4a42c..2c99e82e 100644 --- a/third_party/fwkacllib/inc/ops/selection_ops.h +++ b/third_party/fwkacllib/inc/ops/selection_ops.h @@ -18,8 +18,8 @@ * \file selection_ops.h * \brief */ -#ifndef GE_OP_SELECTION_OPS_H -#define GE_OP_SELECTION_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_SELECTION_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_SELECTION_OPS_H_ #include "graph/operator_reg.h" namespace ge { @@ -79,6 +79,9 @@ REG_OP(Range) *@see Range() *@since V100R001C33 +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Range instead. 
*/ REG_OP(RangeD) .INPUT(x, TensorType({DT_FLOAT,DT_INT32})) @@ -223,6 +226,9 @@ REG_OP(GatherV2) *@par Third-party framework compatibility * Compatible with the TensorFlow operator GatherV2. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use GatherV2 instead. */ REG_OP(GatherV2D) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_UINT32, DT_INT8, DT_UINT8, @@ -325,6 +331,9 @@ REG_OP(StridedSlice) *@par Third-party framework compatibility * Compatible with the TensorFlow operator StridedSlice. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use StridedSlice instead. */ REG_OP(StridedSliceD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_UINT8, DT_INT8, @@ -380,6 +389,9 @@ REG_OP(StridedSliceD) *@par Third-party framework compatibility * Compatible with the TensorFlow operator StridedSliceGradD. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use StridedSliceGrad instead. */ REG_OP(StridedSliceGradD) .INPUT(dy, TensorType::BasicType()) @@ -491,6 +503,9 @@ REG_OP(UnsortedSegmentSum) *@par Third-party framework compatibility * Compatible with the TensorFlow operator UnsortedSegmentSum. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use UnsortedSegmentSum instead. */ REG_OP(UnsortedSegmentSumD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8, DT_UINT8})) @@ -715,6 +730,9 @@ REG_OP(OneHot) *@par Third-party framework compatibility: * Compatible with the TensorFlow operator OneHot. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use OneHot instead. */ REG_OP(OneHotD) .INPUT(x, TensorType({DT_UINT8, DT_INT32})) @@ -790,7 +808,7 @@ REG_OP(SliceD) * @li assist_seq: A 1D tensor of type float16. * with size of 2N, which "N" is the last dimension. * The first N numbers is indices, and the next N numbers is deviation of casting -* float16 to int32 . \n +* int32 to float16. 
\n * @par Attributes: * @li k: A required int that is at least 0, specifying the number of top elements @@ -799,7 +817,7 @@ REG_OP(SliceD) * If true, the resulting "k" elements will be sorted by the values in descending * order. * @li dim: An optional int. Defaults to -1. For reserved use. -* @li largest: An optional bool. Defaults to true. For reserved use. +* @li largest: An optional bool. Defaults to true. For reserved use. \n * @par Outputs: * @li values: A Tensor, specifying the sorted data. Has the same type as "input". @@ -807,7 +825,7 @@ REG_OP(SliceD) * @attention Constraints: * @li k =< 5120 -* @li Size of the last dimension =< 65500 +* @li Size of the last dimension =< 1458176 * @li sorted = true * @li It's unstable sorted indices on the platform of Ascend310 @@ -903,6 +921,9 @@ REG_OP(ScatterNd) *@li "y" has the same type as "x". *@par Third-party framework compatibility * Compatible with the TensorFlow operator ScatterNd. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ScatterNd instead. */ REG_OP(ScatterNdD) .INPUT(indices, TensorType::IndexNumberType()) @@ -1146,6 +1167,9 @@ REG_OP(Cumprod) *y: A Tensor. Has the same type as "x". *@par Third-party framework compatibility * Compatible with the TensorFlow operator Cumprod. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Cumprod instead. */ REG_OP(CumprodD) .INPUT(x, TensorType::NumberType()) @@ -1200,6 +1224,9 @@ REG_OP(Cumsum) *y: A Tensor. Has the same type as "x". *@par Third-party framework compatibility * Compatible with the TensorFlow operator Cumsum. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Cumsum instead. */ REG_OP(CumsumD) .INPUT(x, TensorType::NumberType()) @@ -1253,6 +1280,9 @@ REG_OP(InplaceUpdate) *@par Third-party framework compatibility *Compatible with the TensorFlow operator InplaceUpdate. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use InplaceUpdate instead. 
*/ REG_OP(InplaceUpdateD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) @@ -1305,6 +1335,9 @@ REG_OP(InplaceAdd) *@par Third-party framework compatibility *Compatible with the TensorFlow operator InplaceAdd. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use InplaceAdd instead. */ REG_OP(InplaceAddD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) @@ -1356,6 +1389,9 @@ REG_OP(InplaceSub) *@par Third-party framework compatibility *Compatible with the TensorFlow operator InplaceSub. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use InplaceSub instead. */ REG_OP(InplaceSubD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) @@ -1407,6 +1443,9 @@ REG_OP(ScatterNonAliasingAdd) * @par Outputs: * y: A Tensor of type RealNumberType . \n +* @attention Constraints: +* @li segment_ids must be non-negative tensor. + * @see UnsortedSegmentSum(), UnsortedSegmentProd(), * @par Third-party framework compatibility @@ -1434,6 +1473,9 @@ REG_OP(UnsortedSegmentMin) * @par Outputs: * y: A Tensor.Must have the same type as input "x" . \n +* @attention Constraints: +* @li segment_ids must be non-negative tensor. + * @see UnsortedSegmentProdD(), UnsortedSegmentSumD(), * * @par Restrictions: @@ -1459,6 +1501,9 @@ REG_OP(UnsortedSegmentMinD) * @par Outputs: * y: A Tensor of type RealNumberType . \n +* @attention Constraints: +* @li segment_ids must be non-negative tensor. + * @see UnsortedSegmentSum(), UnsortedSegmentProd(), * @par Third-party framework compatibility @@ -1486,6 +1531,9 @@ REG_OP(UnsortedSegmentMax) * @par Outputs: * y: A Tensor.Must have the same type as input "x" . \n +* @attention Constraints: +* @li segment_ids must be non-negative tensor. + * @see UnsortedSegmentProdD(), * * @par Restrictions: @@ -1510,6 +1558,9 @@ REG_OP(UnsortedSegmentMaxD) * @par Outputs: * y: A Tensor of type NumberType . \n +* @attention Constraints: +* @li segment_ids must be non-negative tensor. 
+ * @see UnsortedSegmentSum(), UnsortedSegmentMin(), * @par Third-party framework compatibility @@ -1541,6 +1592,9 @@ REG_OP(UnsortedSegmentProd) * @li segment_ids must be non-negative tensor. * @see UnsortedSegmentMinD() +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use UnsortedSegmentProd instead. */ REG_OP(UnsortedSegmentProdD) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT16})) @@ -1856,6 +1910,9 @@ REG_OP(CumulativeLogsumexp) *y: A Tensor. Has the same type as "x". *@par Third-party framework compatibility * Compatible with the TensorFlow operator Cumsum. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use CumulativeLogsumexp instead. */ REG_OP(CumulativeLogsumexpD) .INPUT(x, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16})) @@ -1866,4 +1923,4 @@ REG_OP(CumulativeLogsumexpD) .OP_END_FACTORY_REG(CumulativeLogsumexpD) } // namespace ge -#endif // GE_OP_SELECTION_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_SELECTION_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/set_ops.h b/third_party/fwkacllib/inc/ops/set_ops.h index 18df6edf..1d02fa15 100644 --- a/third_party/fwkacllib/inc/ops/set_ops.h +++ b/third_party/fwkacllib/inc/ops/set_ops.h @@ -18,8 +18,8 @@ * \file set_ops.h * \brief */ -#ifndef GE_OP_SET_OPS_H_ -#define GE_OP_SET_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_SET_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_SET_OPS_H_ #include "graph/operator.h" #include "graph/operator_reg.h" @@ -178,4 +178,4 @@ REG_OP(SetSize) .OP_END_FACTORY_REG(SetSize) } // namespace ge -#endif // GE_OP_SET_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_SET_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/sparse_ops.h b/third_party/fwkacllib/inc/ops/sparse_ops.h index 3eecbeab..d7512790 100644 --- a/third_party/fwkacllib/inc/ops/sparse_ops.h +++ b/third_party/fwkacllib/inc/ops/sparse_ops.h @@ -18,8 +18,8 @@ * \file sparse_ops.h * \brief */ -#ifndef GE_OP_SPARSE_OPS_H_ -#define GE_OP_SPARSE_OPS_H_ +#ifndef 
OPS_BUILT_IN_OP_PROTO_INC_SPARSE_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_SPARSE_OPS_H_ #include "graph/operator_reg.h" @@ -1044,4 +1044,4 @@ REG_OP(DeserializeManySparse) .OP_END_FACTORY_REG(DeserializeManySparse) } // namespace ge -#endif // GE_OP_SPARSE_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_SPARSE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/spectral_ops.h b/third_party/fwkacllib/inc/ops/spectral_ops.h index 460dada4..64fa7814 100644 --- a/third_party/fwkacllib/inc/ops/spectral_ops.h +++ b/third_party/fwkacllib/inc/ops/spectral_ops.h @@ -18,8 +18,8 @@ * \file spectral_ops.h * \brief */ -#ifndef GE_OP_SPECTRAL_OPS_H -#define GE_OP_SPECTRAL_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_SPECTRAL_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_SPECTRAL_OPS_H_ #include "graph/operator.h" #include "graph/operator_reg.h" @@ -49,4 +49,4 @@ REG_OP(RFFT) } // namespace ge -#endif //GE_OP_SPECTRAL_OPS_H \ No newline at end of file +#endif // OPS_BUILT_IN_OP_PROTO_INC_SPECTRAL_OPS_H_ \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/split_combination_ops.h b/third_party/fwkacllib/inc/ops/split_combination_ops.h index b66a0213..efe4715d 100644 --- a/third_party/fwkacllib/inc/ops/split_combination_ops.h +++ b/third_party/fwkacllib/inc/ops/split_combination_ops.h @@ -18,8 +18,8 @@ * \file split_combination_ops.h * \brief */ -#ifndef GE_OP_SPLIT_COMBINATION_OPS_H -#define GE_OP_SPLIT_COMBINATION_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_SPLIT_COMBINATION_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_SPLIT_COMBINATION_OPS_H_ #include "graph/operator_reg.h" namespace ge { @@ -75,6 +75,9 @@ REG_OP(Split) *@par Third-party framework compatibility * Compatible with the TensorFlow operator Split. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use Split instead. 
*/ REG_OP(SplitD) .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, @@ -141,6 +144,9 @@ Under the caffe framework, the conversion of slice_point through the cut point t Under the caffe framework,size_splits or axis transformat to split_dim.Only one can effect. *@par Third-party framework compatibility * Compatible with the TensorFlow operator SplitV. + +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use SplitV instead. */ REG_OP(SplitVD) .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, @@ -158,7 +164,8 @@ REG_OP(SplitVD) * Two inputs, including: * @li values: A list of Tensors. Must be one of the following types: int8, int16, int32, * int64, uint8, uint16, uint32, uint64, float16, float32. -* Tensors to be concatenated. All must have size 1 in the first dimension and same shape. +* Tensors to be concatenated. All must have size 1 in the first dimension and same shape. +* It's a dynamic input. * @li shape: A Tensor of the same type as "x". * The final shape of the result. Should be equal to the shapes of any input * but with the number of input values in the first dimension . \n @@ -307,7 +314,7 @@ REG_OP(Concat) *@par Inputs: * x: A list of N Tensors. Must be one of the following types: int8, int16, int32, -* int64, uint8, uint16, uint32, uint64, float16, float32, bool . \n +* int64, uint8, uint16, uint32, uint64, float16, float32, bool . It's a dynamic input. \n *@par Attributes: *@li axis: A optional int, defaultvalue is 0. @@ -333,7 +340,7 @@ REG_OP(Pack) *@par Inputs: *Two inputs, including: * @li concat_dim: A Tensor of type int32. -* @li x: A list of 1D Tensor objects of type int32 . \n +* @li x: A list of 1D Tensor objects of type int32 . It's a dynamic input. \n *@par Attributes: *N: A required int . \n @@ -357,7 +364,7 @@ REG_OP(ConcatOffset) *@par Inputs: *Two inputs, including: * @li concat_dim: A Tensor of type int32. -* @li x: A list of 1D Tensor objects of type int32 . 
\n +* @li x: A list of 1D Tensor objects of type int32 . It's a dynamic input. \n *@par Attributes: *@li Concat_dim: A required int. Must be within the rank of input "x". @@ -379,4 +386,4 @@ REG_OP(ConcatOffsetD) .OP_END_FACTORY_REG(ConcatOffsetD) } // namespace ge -#endif // GE_OP_SPLIT_COMBINATION_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_SPLIT_COMBINATION_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/state_ops.h b/third_party/fwkacllib/inc/ops/state_ops.h index ca85067b..db1f5353 100644 --- a/third_party/fwkacllib/inc/ops/state_ops.h +++ b/third_party/fwkacllib/inc/ops/state_ops.h @@ -18,8 +18,8 @@ * \file state_ops.h * \brief */ -#ifndef GE_OP_STATE_OPS_H_ -#define GE_OP_STATE_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_STATE_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_STATE_OPS_H_ #include "graph/operator_reg.h" @@ -164,4 +164,4 @@ REG_OP(CountUpTo) } // namespace ge -#endif // GE_OP_STATE_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_STATE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/stateful_random_ops.h b/third_party/fwkacllib/inc/ops/stateful_random_ops.h index 779e7cea..366112d6 100644 --- a/third_party/fwkacllib/inc/ops/stateful_random_ops.h +++ b/third_party/fwkacllib/inc/ops/stateful_random_ops.h @@ -18,8 +18,8 @@ * \file stateful_random_ops.h * \brief */ -#ifndef GE_OP_STATEFUL_RANDOM_OPS_H -#define GE_OP_STATEFUL_RANDOM_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_STATEFUL_RANDOM_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_STATEFUL_RANDOM_OPS_H_ #include "graph/operator.h" #include "graph/operator_reg.h" @@ -233,4 +233,4 @@ REG_OP(StatefulUniformInt) } // namespace ge -#endif //GE_OP_STATELESS_RANDOM_OPS_H \ No newline at end of file +#endif // OPS_BUILT_IN_OP_PROTO_INC_STATEFUL_RANDOM_OPS_H_ \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/stateless_random_ops.h b/third_party/fwkacllib/inc/ops/stateless_random_ops.h index d91bc38a..dad3c379 100644 --- a/third_party/fwkacllib/inc/ops/stateless_random_ops.h +++ 
b/third_party/fwkacllib/inc/ops/stateless_random_ops.h @@ -18,8 +18,8 @@ * \file stateless_random_ops.h * \brief */ -#ifndef GE_OP_STATELESS_RANDOM_OPS_H -#define GE_OP_STATELESS_RANDOM_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_STATELESS_RANDOM_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_STATELESS_RANDOM_OPS_H_ #include "graph/operator.h" #include "graph/operator_reg.h" @@ -81,4 +81,4 @@ REG_OP(StatelessRandomUniformInt) } // namespace ge -#endif //GE_OP_STATELESS_RANDOM_OPS_H \ No newline at end of file +#endif // OPS_BUILT_IN_OP_PROTO_INC_STATELESS_RANDOM_OPS_H_ \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/string_ops.h b/third_party/fwkacllib/inc/ops/string_ops.h index 90ee700d..4a88bc79 100644 --- a/third_party/fwkacllib/inc/ops/string_ops.h +++ b/third_party/fwkacllib/inc/ops/string_ops.h @@ -18,8 +18,8 @@ * \file string_ops.h * \brief */ -#ifndef GE_OP_STRING_OPS_H_ -#define GE_OP_STRING_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_ #include #include "graph/operator_reg.h" @@ -559,4 +559,4 @@ REG_OP(DecodeBase64) .OP_END_FACTORY_REG(DecodeBase64) } // namespace ge -#endif // GE_OP_STRING_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/swap_co_ops.h b/third_party/fwkacllib/inc/ops/swap_co_ops.h index fb25c741..a1bf4f8b 100644 --- a/third_party/fwkacllib/inc/ops/swap_co_ops.h +++ b/third_party/fwkacllib/inc/ops/swap_co_ops.h @@ -18,8 +18,8 @@ * \file swap_co_ops.h * \brief */ -#ifndef GE_OP_SWAP_CO_OPS_H_ -#define GE_OP_SWAP_CO_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_SWAP_CO_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_SWAP_CO_OPS_H_ #include "graph/operator_reg.h" @@ -59,4 +59,4 @@ REG_OP(SwapCo) } // namespace ge -#endif // GE_OP_SWAP_CO_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_SWAP_CO_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/transformation_ops.h b/third_party/fwkacllib/inc/ops/transformation_ops.h index 
ed46d95c..290e5880 100644 --- a/third_party/fwkacllib/inc/ops/transformation_ops.h +++ b/third_party/fwkacllib/inc/ops/transformation_ops.h @@ -18,8 +18,8 @@ * \file transformation_ops.h * \brief */ -#ifndef GE_OP_TRANSFORMATION_OPS_H -#define GE_OP_TRANSFORMATION_OPS_H +#ifndef OPS_BUILT_IN_OP_PROTO_INC_TRANSFORMATION_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_TRANSFORMATION_OPS_H_ #include "graph/operator_reg.h" @@ -235,8 +235,12 @@ REG_OP(BatchToSpaceND) *@par Outputs: *y: A Tensor with format NC1HWC0. Has the same type as input "x". + *@par Third-party framework compatibility * Compatible with the TensorFlow operator BatchToSpaceND. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use BatchToSpaceND instead. */ REG_OP(BatchToSpaceNDD) .INPUT(x, TensorType::BasicType()) @@ -283,6 +287,9 @@ REG_OP(SpaceToBatchND) *@par Third-party framework compatibility * Compatible with the TensorFlow operator SpaceToBatchND. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use SpaceToBatchND instead. */ REG_OP(SpaceToBatchNDD) .INPUT(x, TensorType::BasicType()) @@ -404,6 +411,9 @@ REG_OP(BatchToSpace) *@par Third-party framework compatibility * Compatible with the TensorFlow operator BatchToSpace. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use BatchToSpace instead. */ REG_OP(BatchToSpaceD) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, @@ -457,6 +467,9 @@ REG_OP(SpaceToBatch) *y: A Tensor. Has the same type as input "x". *@par Third-party framework compatibility *@ Compatible with the TensorFlow operator SpaceToBatch. +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use SpaceToBatch instead. */ REG_OP(SpaceToBatchD) .INPUT(x, TensorType::BasicType()) @@ -516,7 +529,9 @@ REG_OP(Unpack) * with patch_sizes_eff = patch_sizes + (patch_sizes - 1) * * (rates - 1), followed by subsampling them spatially by a factor of rates. 
* This is equivalent to rate in dilated (a.k.a. Atrous) convolutions. -* @li padding: A required string. The type of padding algorithm to use . \n +* @li padding: A required string. The type of padding algorithm to use, + support "SAME" or "VALID". \n +* @li data_format: A required string. The format of input, only supported NHWC. \n * @par Outputs: * y: A 4D Tensor with shape [batch, out_rows, out_cols, ksize_rows * @@ -537,6 +552,7 @@ REG_OP(ExtractImagePatches) .REQUIRED_ATTR(strides, ListInt) .REQUIRED_ATTR(rates, ListInt) .REQUIRED_ATTR(padding, String) + .ATTR(data_format, String, "NHWC") .OP_END_FACTORY_REG(ExtractImagePatches) /** @@ -551,7 +567,9 @@ REG_OP(ExtractImagePatches) * dimension of "x". * @li strides: A required list or tuple. How far the centers of two consecutive * patches are in "x". Must be: [1, stride_planes, stride_rows, stride_cols, 1]. -* @li padding: A required string. The type of padding algorithm to use . \n +* @li padding: A required string. The type of padding algorithm to use , +* support "SAME" or "VALID" . \n +* @li data_format: An optional string. The format of input, only supported NDHWC. \n * @par Outputs: * Output: A 5D Tensor with shape [batch, out_planes, out_rows, out_cols, ksize_planes * @@ -570,6 +588,7 @@ REG_OP(ExtractVolumePatches) .REQUIRED_ATTR(ksizes, ListInt) .REQUIRED_ATTR(strides, ListInt) .REQUIRED_ATTR(padding, String) + .ATTR(data_format, String, "NDHWC") .OP_END_FACTORY_REG(ExtractVolumePatches) /** @@ -585,6 +604,9 @@ REG_OP(ExtractVolumePatches) *@par Outputs: *y: A Tensor. Has the same type as "x". +* +* @par Restrictions: +* Warning: THIS FUNCTION IS DEPRECATED. Please use ConfusionTranspose instead. 
*/ REG_OP(ConfusionTransposeD) .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, @@ -695,4 +717,4 @@ REG_OP(CompressFcOp) .OP_END_FACTORY_REG(CompressFcOp) } // namespace ge -#endif // GE_OP_TRANSFORMATION_OPS_H +#endif // OPS_BUILT_IN_OP_PROTO_INC_TRANSFORMATION_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/warp_perspective_ops.h b/third_party/fwkacllib/inc/ops/warp_perspective_ops.h index c96b96be..e19cbd7c 100644 --- a/third_party/fwkacllib/inc/ops/warp_perspective_ops.h +++ b/third_party/fwkacllib/inc/ops/warp_perspective_ops.h @@ -18,8 +18,8 @@ * \file warp_perspective_ops.h * \brief */ -#ifndef GE_OP_WARP_PERSPECTIVE_OPS_H_ -#define GE_OP_WARP_PERSPECTIVE_OPS_H_ +#ifndef OPS_BUILT_IN_OP_PROTO_INC_WARP_PERSPECTIVE_OPS_H_ +#define OPS_BUILT_IN_OP_PROTO_INC_WARP_PERSPECTIVE_OPS_H_ #include "graph/operator_reg.h" #include "graph/operator.h" @@ -56,4 +56,4 @@ REG_OP(WarpPerspective) .OP_END_FACTORY_REG(WarpPerspective) } // namespace ge -#endif // GE_OP_WARP_PERSPECTIVE_OPS_H_ +#endif // OPS_BUILT_IN_OP_PROTO_INC_WARP_PERSPECTIVE_OPS_H_ diff --git a/third_party/fwkacllib/inc/register/op_kernel_registry.h b/third_party/fwkacllib/inc/register/op_kernel_registry.h index 5fed8960..2c479e92 100644 --- a/third_party/fwkacllib/inc/register/op_kernel_registry.h +++ b/third_party/fwkacllib/inc/register/op_kernel_registry.h @@ -41,7 +41,6 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpKernelRegistry { private: OpKernelRegistry(); class OpKernelRegistryImpl; - /*lint -e148*/ std::unique_ptr impl_; }; } // namespace ge diff --git a/third_party/fwkacllib/inc/register/op_tiling.h b/third_party/fwkacllib/inc/register/op_tiling.h index bcd4cd5e..38370819 100644 --- a/third_party/fwkacllib/inc/register/op_tiling.h +++ b/third_party/fwkacllib/inc/register/op_tiling.h @@ -70,6 +70,7 @@ struct OpRunInfo { uint32_t block_dim; std::vector workspaces; ByteBuffer tiling_data; + bool clear_atomic; }; diff --git 
a/third_party/fwkacllib/inc/register/ops_kernel_builder_registry.h b/third_party/fwkacllib/inc/register/ops_kernel_builder_registry.h new file mode 100644 index 00000000..3c8e0470 --- /dev/null +++ b/third_party/fwkacllib/inc/register/ops_kernel_builder_registry.h @@ -0,0 +1,67 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INC_REGISTER_OPS_KERNEL_BUILDER_REGISTRY_H_ +#define INC_REGISTER_OPS_KERNEL_BUILDER_REGISTRY_H_ + +#include +#include "register/register_types.h" +#include "common/opskernel/ops_kernel_builder.h" + +namespace ge { +using OpsKernelBuilderPtr = std::shared_ptr; + +class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpsKernelBuilderRegistry { + public: + static OpsKernelBuilderRegistry &GetInstance(); + + void Register(const std::string &lib_name, const OpsKernelBuilderPtr &instance); + + void Unregister(const std::string &lib_name); + + void UnregisterAll(); + + const std::map &GetAll() const; + + private: + std::map kernel_builders_; +}; + +class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpsKernelBuilderRegistrar { + public: + using CreateFn = OpsKernelBuilder *(*)(); + OpsKernelBuilderRegistrar(const std::string &kernel_lib_name, CreateFn fn); + ~OpsKernelBuilderRegistrar(); + +private: + std::string kernel_lib_name_; +}; + +#define REGISTER_OPS_KERNEL_BUILDER(kernel_lib_name, builder) \ + REGISTER_OPS_KERNEL_BUILDER_UNIQ_HELPER(__COUNTER__, 
kernel_lib_name, builder) + +#define REGISTER_OPS_KERNEL_BUILDER_UNIQ_HELPER(ctr, kernel_lib_name, builder) \ + REGISTER_OPS_KERNEL_BUILDER_UNIQ(ctr, kernel_lib_name, builder) + +#define REGISTER_OPS_KERNEL_BUILDER_UNIQ(ctr, kernel_lib_name, builder) \ + static ::ge::OpsKernelBuilderRegistrar register_op_kernel_builder_##ctr \ + __attribute__((unused)) = \ + ::ge::OpsKernelBuilderRegistrar(kernel_lib_name, []()->::ge::OpsKernelBuilder* { \ + return new (std::nothrow) builder(); \ + }) +} // namespace ge + +#endif // INC_REGISTER_OPS_KERNEL_BUILDER_REGISTRY_H_ diff --git a/third_party/fwkacllib/inc/runtime/base.h b/third_party/fwkacllib/inc/runtime/base.h index 17243802..3c7afd95 100644 --- a/third_party/fwkacllib/inc/runtime/base.h +++ b/third_party/fwkacllib/inc/runtime/base.h @@ -100,6 +100,9 @@ typedef enum tagRtError { RT_ERROR_MODEL_ID, RT_ERROR_MODEL_EXE_FAILED, RT_ERROR_END_OF_SEQUENCE, // end of sequence + RT_ERROR_MODEL_EXIT, + RT_ERROR_MODEL_EXIT_STREAM_UNBIND, + RT_ERROR_MODEL_EXIT_ID, RT_ERROR_EVENT_BASE = 0x07050000, RT_ERROR_EVENT_NULL, @@ -387,6 +390,8 @@ typedef void (*rtErrorCallback)(rtExceptionType); typedef void (*rtTaskFailCallback)(rtExceptionInfo *exceptionInfo); +typedef void (*rtProfilingCallback)(uint32_t devId, bool isOpenDevice); + /** * @ingroup dvrt_base * @brief stream handle. @@ -469,6 +474,14 @@ RTS_API rtError_t rtSetExceptCallback(rtErrorCallback callback); */ RTS_API rtError_t rtSetTaskFailCallback(rtTaskFailCallback callback); +/** + * @ingroup dvrt_base + * @brief register callback for deviceid + * @param [out] NA + * @return RT_ERROR_NONE for ok + */ +RTS_API rtError_t rtSetPoriflingCallback(rtProfilingCallback callback); + /** * @ingroup dvrt_base * @brief notify handle. 
@@ -581,6 +594,16 @@ RTS_API rtError_t rtLabelListCpy(rtLabel_t *label, uint32_t labelNumber, void *d */ RTS_API rtError_t rtLabelCreateEx(rtLabel_t *label, rtStream_t stream); +/** + * @ingroup dvrt_base + * @brief get current thread last stream id and task id + * @param [out] stream id and task id + * @param [in] null + * @return RT_ERROR_NONE for ok + * @return RT_ERROR_INVALID_VALUE for input null ptr + */ +RTS_API rtError_t rtGetTaskIdAndStreamID(uint32_t *taskid, uint32_t *streamid); + #if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE) } #endif diff --git a/third_party/fwkacllib/inc/runtime/context.h b/third_party/fwkacllib/inc/runtime/context.h index 39651817..21296ca2 100644 --- a/third_party/fwkacllib/inc/runtime/context.h +++ b/third_party/fwkacllib/inc/runtime/context.h @@ -149,6 +149,13 @@ RTS_API rtError_t rtGetGroupInfo(int32_t groupId, rtGroupInfo_t* groupInfo, uint */ RTS_API rtError_t rtGetGroupCount(uint32_t *count); +/** + * @ingroup rt_context + * @brief set context INF mode + * @param [in] mode + * @return RT_ERROR_NONE for ok + */ +RTS_API rtError_t rtSetCtxINFMode(bool mode); #if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE) } #endif diff --git a/third_party/fwkacllib/inc/runtime/dev.h b/third_party/fwkacllib/inc/runtime/dev.h index 0bff548b..dddb1e10 100644 --- a/third_party/fwkacllib/inc/runtime/dev.h +++ b/third_party/fwkacllib/inc/runtime/dev.h @@ -339,6 +339,23 @@ RTS_API rtError_t rtGetPairDevicesInfo(uint32_t devId, uint32_t otherDevId, int3 * @return RT_ERROR_NONE for ok */ RTS_API rtError_t rtGetRtCapability(rtFeatureType_t featureType, int32_t featureInfo, int64_t *value); + +/** + * @ingroup dvrt_dev + * @brief set target device for current thread + * @param [int] device the device id + * @return RT_ERROR_NONE for ok + * @return RT_ERROR_INVALID_VALUE for error input + */ +RTS_API rtError_t rtSetDeviceWithoutTsd(int32_t device); + +/** + * @ingroup dvrt_dev + * @brief reset all opened device + * @return RT_ERROR_NONE 
for ok + * @return RT_ERROR_INVALID_VALUE for error input + */ +RTS_API rtError_t rtDeviceResetWithoutTsd(int32_t device); #if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE) } #endif diff --git a/third_party/fwkacllib/inc/runtime/kernel.h b/third_party/fwkacllib/inc/runtime/kernel.h index 2030634a..2fd7799d 100644 --- a/third_party/fwkacllib/inc/runtime/kernel.h +++ b/third_party/fwkacllib/inc/runtime/kernel.h @@ -169,6 +169,18 @@ typedef void (*rtCallback_t)(void *fnData); */ #define RT_DEV_BINARY_MAGIC_ELF_AIVEC 0x41415246 +/** + * @ingroup rt_kernel + * @brief magic number of elf binary for aicube + */ +#define RT_DEV_BINARY_MAGIC_ELF_AICUBE 0x41415247 + +/** + * @ingroup rt_kernel + * @brief magic number of elf binary for aivector + */ +#define RT_DEV_BINARY_MAGIC_ELF_AIVECTOR 0x41415248 + /** * @ingroup rt_kernel_flags * @brief kernel op bit flags diff --git a/third_party/fwkacllib/inc/runtime/mem.h b/third_party/fwkacllib/inc/runtime/mem.h index a506e94a..0d9e20ce 100644 --- a/third_party/fwkacllib/inc/runtime/mem.h +++ b/third_party/fwkacllib/inc/runtime/mem.h @@ -17,9 +17,7 @@ #ifndef __CCE_RUNTIME_MEM_H__ #define __CCE_RUNTIME_MEM_H__ -/*lint -e7*/ #include -/*lint +e7*/ #include "base.h" #include "config.h" #include "stream.h" @@ -177,6 +175,28 @@ typedef struct tagRtPointerAttributes { uint32_t pageSize; } rtPointerAttributes_t; + +typedef struct rtMallocHostSharedMemoryIn { + const char* name; + const uint64_t size; + uint32_t flag; +} rtMallocHostSharedMemoryIn; + +typedef struct rtMallocHostSharedMemoryOut { + int fd; + void* ptr; + void* devPtr; +} rtMallocHostSharedMemoryOut; + +typedef struct rtFreeHostSharedMemoryIn { + const char* name; + const uint64_t size; + int fd; + void* ptr; + void* devPtr; +} rtFreeHostSharedMemoryIn; + + /** * @ingroup dvrt_mem * @brief alloc device memory @@ -235,6 +255,28 @@ RTS_API rtError_t rtMallocHost(void **hostPtr, uint64_t size); */ RTS_API rtError_t rtFreeHost(void *hostPtr); +/** + * @ingroup dvrt_mem + 
* @brief alloc host shared memory + * @param [in] in alloc host shared memory inputPara pointer + * @param [in] out alloc host shared memory outputInfo pointer + * @return RT_ERROR_NONE for ok + * @return RT_ERROR_INVALID_VALUE for error input + */ + +RTS_API rtError_t rtMallocHostSharedMemory(rtMallocHostSharedMemoryIn *in, + rtMallocHostSharedMemoryOut *out); + +/** + * @ingroup dvrt_mem + * @brief free host memory + * @param [in] in free host shared memory inputPara pointer + * @return RT_ERROR_NONE for ok + * @return RT_ERROR_INVALID_VALUE for error input + */ + +RTS_API rtError_t rtFreeHostSharedMemory(rtFreeHostSharedMemoryIn *in); + /** * @ingroup dvrt_mem * @brief alloc managed memory diff --git a/third_party/fwkacllib/inc/runtime/rt_model.h b/third_party/fwkacllib/inc/runtime/rt_model.h index 59a1ba7d..5d49c32a 100644 --- a/third_party/fwkacllib/inc/runtime/rt_model.h +++ b/third_party/fwkacllib/inc/runtime/rt_model.h @@ -49,6 +49,7 @@ typedef enum tagModelTaskType { RT_MODEL_TASK_MEMCPY_ADDR_ASYNC, RT_MODEL_TASK_STREAM_LABEL_SWITCH_BY_INDEX, RT_MODEL_TASK_STREAM_LABEL_GOTO, + RT_MODEL_TASK_MODEL_EXIT, } rtModelTaskType_t; typedef enum tagModelStreamType { @@ -224,6 +225,13 @@ typedef struct tagrtModelEndGraphTaskInfo { uint32_t reserved[8]; } rtModelEndGraphTaskInfo_t; +typedef struct tagrtModelExitInfo { + uint32_t modelId; + uint32_t streamId; + uint32_t reserved[8]; +} rtModelExitTaskInfo_t; + + typedef struct tagrtStreamLabelSwitchByIndexTask_t { uint64_t indexPtr; uint64_t labelInfoPtr; @@ -256,6 +264,7 @@ typedef struct tagTaskInfo { rtRdmaSendTaskInfo_t rdmaSendTask; rtRdmaDbSendTaskInfo_t rdmaDbSendTask; rtModelEndGraphTaskInfo_t modelEndGraphTask; + rtModelExitTaskInfo_t modelExitTask; rtStreamSwitchNTaskInfo_t streamSwitchNTask; rtStreamLabelSwitchByIndexTask_t streamLabelSwitchIndexTask; rtStreamLabelGotoTask_t streamLabelGotoTask; @@ -389,6 +398,16 @@ RTS_API rtError_t rtModelExecutorSet(rtModel_t model, uint8_t flags); */ RTS_API rtError_t 
rtModelAbort(rtModel_t model); +/** + * @ingroup rt_model + * @brief end graph task to model default stream + * @param [in] model model to execute + * @param [in] end graph stream + * @return RT_ERROR_NONE for ok + * @return RT_ERROR_INVALID_VALUE for error input + */ +RTS_API rtError_t rtModelExit(rtModel_t model, rtStream_t stream); + /** * @ingroup rt_model * @brief bind queue diff --git a/third_party/fwkacllib/inc/tdt/tdt_host_interface.h b/third_party/fwkacllib/inc/tdt/tdt_host_interface.h index 0e62a85c..1cab6fd1 100644 --- a/third_party/fwkacllib/inc/tdt/tdt_host_interface.h +++ b/third_party/fwkacllib/inc/tdt/tdt_host_interface.h @@ -135,6 +135,93 @@ int32_t TdtHostPopData(const std::string &channelName, std::vector &it * @li tdt_host_interface.h: Header file where the interface declaration is located. */ int32_t TdtHostStop(const std::string &channelName); + +/** +* @ingroup TdtInFeedInit +* @brief Initialize the interface, start and initialize various general thread, log and other services +* +* @par Function +* Initialize the interface, start and initialize various general thread, log and other services +* +* @param deviceId [IN] type #unsigned int. logic device ID +* @retval #0 Success +* @retval #Not 0 Fail +* +* @par Dependency +* @li libtsdclient.so: Library to which the interface belongs. +* @li tdt_host_interface.h: Header file where the interface declaration is located. +*/ +int32_t TdtInFeedInit(uint32_t deviceId); + +/** +* @ingroup TdtOutFeedInit +* @brief Initialize the interface, start and initialize various general thread, log and other services +* +* @par Function +* Initialize the interface, start and initialize various general thread, log and other services +* +* @param deviceId [IN] type #unsigned int. logic device ID +* @retval #0 Success +* @retval #Not 0 Fail +* +* @par Dependency +* @li libtsdclient.so: Library to which the interface belongs. +* @li tdt_host_interface.h: Header file where the interface declaration is located. 
+*/ +int32_t TdtOutFeedInit(uint32_t deviceId); + +/** +* @ingroup TdtInFeedDestroy +* @brief Notify TDT component to close related resources +* +* @par Function +* Notify TDT component to close related resources +* +* @param deviceId [IN] type #unsigned int. logic device ID +* @retval 0 Success +* @retval OtherValues Fail +* +* @par Dependency +* @li libtsdclient.so: Library to which the interface belongs. +* @li tdt_host_interface.h: Header file where the interface declaration is located. +*/ +int32_t TdtInFeedDestroy(uint32_t deviceId); + +/** +* @ingroup TdtOutFeedDestroy +* @brief Notify TDT component to close related resources +* +* @par Function +* Notify TDT component to close related resources +* +* @param NA +* @retval 0 Success +* @retval OtherValues Fail +* +* @par Dependency +* @li libtsdclient.so: Library to which the interface belongs. +* @li tdt_host_interface.h: Header file where the interface declaration is located. +*/ +int32_t TdtOutFeedDestroy(); + +/** +* @ingroup TdtInFeedData +* @brief Blocking queue. When the queue is full, the Push interface will block. +* +* @par Function +* Blocking queue. When the queue is full, the Push interface will block. +* +* @param channelName [IN] type #String. queue channel name +* @param item [IN] type #vector DataItem is defined in data_common.h. input data +* @retval 0 Success +* @retval OtherValues Fail +* +* @par Dependency +* @li libtsdclient.so: Library to which the interface belongs. +* @li tdt_host_interface.h: Header file where the interface declaration is located.
+* @li data_common.h: Header file where 'DataItem' defined +*/ +int32_t TdtInFeedData(const std::string &channelName, const std::vector &item, uint32_t deviceId); } // namespace tdt #ifdef __cplusplus } diff --git a/third_party/fwkacllib/inc/toolchain/prof_acl_api.h b/third_party/fwkacllib/inc/toolchain/prof_acl_api.h index 4f216239..c8715041 100644 --- a/third_party/fwkacllib/inc/toolchain/prof_acl_api.h +++ b/third_party/fwkacllib/inc/toolchain/prof_acl_api.h @@ -152,4 +152,13 @@ MSVP_PROF_API int32_t ProfStopProfiling(const ProfConfig *profStopCfg); */ MSVP_PROF_API int32_t ProfFinalize(); +/** + * @name ProfGetDataTypeConfig + * @brief get the dataTypeConfig that profiling was started with on one device + * @param deviceId [IN] deviceId to get dataTypeConfig + * @param dataTypeConfig [OUT] dataTypeConfig obtained + * @return ProfErrorCode + */ +MSVP_PROF_API int32_t ProfGetDataTypeConfig(uint32_t deviceId, uint64_t &dataTypeConfig); + #endif // MSPROF_ENGINE_PROF_ACL_API_H_ diff --git a/third_party/fwkacllib/inc/toolchain/slog.h b/third_party/fwkacllib/inc/toolchain/slog.h index 2cb00a05..bce58f32 100644 --- a/third_party/fwkacllib/inc/toolchain/slog.h +++ b/third_party/fwkacllib/inc/toolchain/slog.h @@ -25,10 +25,20 @@ extern "C" { #define LINUX 0 #endif // LINUX +#ifndef WIN +#define WIN 1 +#endif + #ifndef OS_TYPE #define OS_TYPE 0 #endif // OS_TYPE +#if (OS_TYPE == LINUX) +#define DLL_EXPORT __attribute__((visibility("default"))) +#else +#define DLL_EXPORT _declspec(dllexport) +#endif + /** * @ingroup slog * @@ -180,12 +190,11 @@ enum { INVLID_MOUDLE_ID }; -#if (OS_TYPE == LINUX) /** * @ingroup slog * @brief External log interface, which called by modules */ -extern void dlog_init(void); +DLL_EXPORT void dlog_init(void); /** * @ingroup slog @@ -195,7 +204,7 @@ extern void dlog_init(void); * @param [out]enableEvent: 1: enable; 0: disable * @return: module level(0: debug, 1: info, 2: warning, 3: error, 4: null output) */ -extern int dlog_getlevel(int moduleId, int *enableEvent); +DLL_EXPORT int
dlog_getlevel(int moduleId, int *enableEvent); /** * @ingroup slog @@ -206,7 +215,7 @@ extern int dlog_getlevel(int moduleId, int *enableEvent); * @param [in]enableEvent: 1: enable; 0: disable, others:invalid * @return: 0: SUCCEED, others: FAILED */ -extern int dlog_setlevel(int moduleId, int level, int enableEvent); +DLL_EXPORT int dlog_setlevel(int moduleId, int level, int enableEvent); /** * @ingroup slog @@ -217,7 +226,7 @@ extern int dlog_setlevel(int moduleId, int level, int enableEvent); * @param [in]logLevel: eg: DLOG_EVENT/DLOG_ERROR/DLOG_WARN/DLOG_INFO/DLOG_DEBUG * @return: 1:enable, 0:disable */ -extern int CheckLogLevel(int moduleId, int logLevel); +DLL_EXPORT int CheckLogLevel(int moduleId, int logLevel); /** * @ingroup slog @@ -343,7 +352,7 @@ extern int CheckLogLevel(int moduleId, int logLevel); * @ingroup slog * @brief DlogFlush: flush log buffer to file */ -void DlogFlush(void); +DLL_EXPORT void DlogFlush(void); /** * @ingroup slog @@ -357,11 +366,6 @@ void DlogEventInner(int moduleId, const char *fmt, ...); void DlogInner(int moduleId, int level, const char *fmt, ...); void DlogWithKVInner(int moduleId, int level, KeyValue *pstKVArray, int kvNum, const char *fmt, ...); -#else -_declspec(dllexport) void dlog_init(void); -_declspec(dllexport) int dlog_getlevel(int moduleId, int *enableEvent); -#endif // OS_TYPE - #ifdef __cplusplus } #endif // __cplusplus