diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h index 68743bc8..113b8bc6 100644 --- a/inc/external/ge/ge_api_types.h +++ b/inc/external/ge/ge_api_types.h @@ -222,6 +222,18 @@ const char *const OPTION_GE_MAX_DUMP_OP_NUM = "ge.maxDumpOpNum"; // Its value should be "0" or "1", default value is "1" const char *const ENABLE_PRINT_OP_PASS = "ge.enablePrintOpPass"; +// Configure operator compilation path +// Its value should be file path, default value is "./" +const char *const DEBUG_DIR = "ge.debugDir"; + +// Configure operator compiler cache path +// Its value should be file path, default value is "./" +const char *const OP_COMPILER_CACHE_DIR = "ge.op_compiler_cache_dir"; + +// Configure operator compiler cache mode +// Its value should be "disable", "enable" or "force", default value is "disable" +const char *const OP_COMPILER_CACHE_MODE = "ge.op_compiler_cache_mode"; + // Configure whether to use single stream. // Its value should be "true" or "false", default value is "false" const char *const ENABLE_SINGLE_STREAM = "ge.enableSingleStream"; @@ -295,7 +307,9 @@ static const char *const OUT_NODES = ge::OUTPUT_NODE_NAME.c_str(); static const char *const INPUT_FP16_NODES = ge::INPUT_FP16_NODES.c_str(); static const char *const LOG_LEVEL = "log"; static const char *const OPTYPELIST_FOR_IMPLMODE = ge::OPTYPELIST_FOR_IMPLMODE.c_str(); - +static const char *const DEBUG_DIR = ge::DEBUG_DIR; +static const char *const OP_COMPILER_CACHE_DIR = ge::OP_COMPILER_CACHE_DIR; +static const char *const OP_COMPILER_CACHE_MODE = ge::OP_COMPILER_CACHE_MODE; // for interface: aclgrphBuildModel const std::set ir_builder_suppported_options = { INPUT_FORMAT, INPUT_SHAPE, OP_NAME_MAP, @@ -317,7 +331,10 @@ const std::set global_options = {CORE_TYPE, FUSION_SWITCH_FILE, ENABLE_SMALL_CHANNEL, OP_SELECT_IMPL_MODE, - OPTYPELIST_FOR_IMPLMODE}; + OPTYPELIST_FOR_IMPLMODE, + DEBUG_DIR, + OP_COMPILER_CACHE_DIR, + OP_COMPILER_CACHE_MODE}; } // namespace ir_option } // namespace ge diff --git a/inc/external/graph/gnode.h b/inc/external/graph/gnode.h index 6ed39dd1..14fb604c 100644 --- a/inc/external/graph/gnode.h +++ b/inc/external/graph/gnode.h @@ -116,9 +116,9 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GNode { bool HasAttr(const ge::AscendString &name); - graphStatus GetSubgraph(uint32_t index, GraphPtr graph) const; + graphStatus GetSubgraph(uint32_t index, GraphPtr &graph) const; - graphStatus GetALLSubgraphs(std::vector graph_list) const; + graphStatus GetALLSubgraphs(std::vector &graph_list) const; private: std::shared_ptr impl_; diff --git a/inc/external/hccl/hccl_types.h b/inc/external/hccl/hccl_types.h index 0d2b9ca5..0e832396 100644 --- a/inc/external/hccl/hccl_types.h +++ b/inc/external/hccl/hccl_types.h @@ -1,101 +1,101 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
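
Returning to the ge_api_types.h hunk at the top of this patch: the three new option keys are plain string constants, so a caller just places them in the usual options map. A minimal sketch, assuming the map is later handed to graph-build or session initialization (the initialization entry point itself is not part of this change; values shown are illustrative):

  // Sketch only: the keys come from this patch; paths, values and the surrounding
  // initialization call are illustrative, not prescribed by the change.
  #include <map>
  #include <string>
  #include "ge/ge_api_types.h"

  std::map<std::string, std::string> build_options = {
      {ge::DEBUG_DIR, "./debug"},                 // operator compilation path, default "./"
      {ge::OP_COMPILER_CACHE_DIR, "./op_cache"},  // operator compiler cache path, default "./"
      {ge::OP_COMPILER_CACHE_MODE, "enable"},     // "disable" (default), "enable" or "force"
  };
  // The same keys are also appended to ir_option::global_options below, so the IR
  // builder accepts them as global options.
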
- */ - -/** - * @file hccl_types.h - * @brief HCCL data type definition - * - */ - -#ifndef HCCL_TYPES_H_ -#define HCCL_TYPES_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/** - * @brief HCCL functions return value definition - */ -typedef enum { - HCCL_SUCCESS = 0, /**< success */ - HCCL_E_PARA = 1, /**< parameter error */ - HCCL_E_PTR = 2, /**< empty pointer */ - HCCL_E_MEMORY = 3, /**< memory error */ - HCCL_E_INTERNAL = 4, /**< internal error */ - HCCL_E_NOT_SUPPORT = 5, /**< not support feature */ - HCCL_E_NOT_FOUND = 6, /**< not found specific resource */ - HCCL_E_UNAVAIL = 7, /**< resource unavailable */ - HCCL_E_SYSCALL = 8, /**< call system interface error */ - HCCL_E_TIMEOUT = 9, /**< timeout */ - HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */ - HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */ - HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */ - HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */ - HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */ - HCCL_E_RUNTIME = 15, /**< call runtime api fail */ - HCCL_E_DRV = 16, /**< call driver api fail */ - HCCL_E_PROFILING = 17, /**< call profiling api fail */ - HCCL_E_CCE = 18, /**< call cce api fail */ - HCCL_E_NETWORK = 19, /**< call network api fail */ - HCCL_E_RESERVED /**< reserved */ -} HcclResult; - -/** - * @brief handle to HCCL communicator - */ -typedef void *HcclComm; - -/** - * @brief HCCL Reduction opperation - */ -typedef enum { - HCCL_REDUCE_SUM = 0, /**< sum */ - HCCL_REDUCE_PROD = 1, /**< prod */ - HCCL_REDUCE_MAX = 2, /**< max */ - HCCL_REDUCE_MIN = 3, /**< min */ - HCCL_REDUCE_RESERVED /**< reserved */ -} HcclReduceOp; - -/** - * @brief HCCL data type - */ -typedef enum { - HCCL_DATA_TYPE_INT8 = 0, /**< int8 */ - HCCL_DATA_TYPE_INT16 = 1, /**< int16 */ - HCCL_DATA_TYPE_INT32 = 2, /**< int32 */ - HCCL_DATA_TYPE_FP16 = 3, /**< fp16 */ - HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */ - HCCL_DATA_TYPE_INT64 = 5, /**< int64 */ - HCCL_DATA_TYPE_UINT64 = 6, /**< uint64 */ - HCCL_DATA_TYPE_RESERVED /**< reserved */ -} HcclDataType; - -const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length - -/** - * @brief HCCL root info - */ -typedef struct HcclRootInfoDef { - char internal[HCCL_ROOT_INFO_BYTES]; -} HcclRootInfo; - -#ifdef __cplusplus -} -#endif // __cplusplus -#endif // HCCL_TYPES_H_ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file hccl_types.h + * @brief HCCL data type definition + * + */ + +#ifndef HCCL_TYPES_H_ +#define HCCL_TYPES_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/** + * @brief HCCL functions return value definition + */ +typedef enum { + HCCL_SUCCESS = 0, /**< success */ + HCCL_E_PARA = 1, /**< parameter error */ + HCCL_E_PTR = 2, /**< empty pointer */ + HCCL_E_MEMORY = 3, /**< memory error */ + HCCL_E_INTERNAL = 4, /**< internal error */ + HCCL_E_NOT_SUPPORT = 5, /**< not support feature */ + HCCL_E_NOT_FOUND = 6, /**< not found specific resource */ + HCCL_E_UNAVAIL = 7, /**< resource unavailable */ + HCCL_E_SYSCALL = 8, /**< call system interface error */ + HCCL_E_TIMEOUT = 9, /**< timeout */ + HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */ + HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */ + HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */ + HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */ + HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */ + HCCL_E_RUNTIME = 15, /**< call runtime api fail */ + HCCL_E_DRV = 16, /**< call driver api fail */ + HCCL_E_PROFILING = 17, /**< call profiling api fail */ + HCCL_E_CCE = 18, /**< call cce api fail */ + HCCL_E_NETWORK = 19, /**< call network api fail */ + HCCL_E_RESERVED /**< reserved */ +} HcclResult; + +/** + * @brief handle to HCCL communicator + */ +typedef void *HcclComm; + +/** + * @brief HCCL Reduction opperation + */ +typedef enum { + HCCL_REDUCE_SUM = 0, /**< sum */ + HCCL_REDUCE_PROD = 1, /**< prod */ + HCCL_REDUCE_MAX = 2, /**< max */ + HCCL_REDUCE_MIN = 3, /**< min */ + HCCL_REDUCE_RESERVED /**< reserved */ +} HcclReduceOp; + +/** + * @brief HCCL data type + */ +typedef enum { + HCCL_DATA_TYPE_INT8 = 0, /**< int8 */ + HCCL_DATA_TYPE_INT16 = 1, /**< int16 */ + HCCL_DATA_TYPE_INT32 = 2, /**< int32 */ + HCCL_DATA_TYPE_FP16 = 3, /**< fp16 */ + HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */ + HCCL_DATA_TYPE_INT64 = 5, /**< int64 */ + HCCL_DATA_TYPE_UINT64 = 6, /**< uint64 */ + HCCL_DATA_TYPE_RESERVED /**< reserved */ +} HcclDataType; + +const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length + +/** + * @brief HCCL root info + */ +typedef struct HcclRootInfoDef { + char internal[HCCL_ROOT_INFO_BYTES]; +} HcclRootInfo; + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // HCCL_TYPES_H_ diff --git a/inc/framework/common/types.h b/inc/framework/common/types.h index c9400628..e5043d94 100644 --- a/inc/framework/common/types.h +++ b/inc/framework/common/types.h @@ -449,6 +449,7 @@ REGISTER_OPTYPE_DECLARE(MEMCPYASYNC, "MemcpyAsync"); REGISTER_OPTYPE_DECLARE(MEMCPYADDRASYNC, "MemcpyAddrAsync"); REGISTER_OPTYPE_DECLARE(STREAMMERGE, "StreamMerge"); REGISTER_OPTYPE_DECLARE(ENDGRAPH, "EndGraph"); +REGISTER_OPTYPE_DECLARE(MODELEXIT, "ModelExit"); REGISTER_OPTYPE_DECLARE(SEND, "Send"); REGISTER_OPTYPE_DECLARE(RECV, "Recv"); REGISTER_OPTYPE_DECLARE(ENDOFSEQUENCE, "EndOfSequence"); diff --git a/inc/framework/omg/omg_inner_types.h b/inc/framework/omg/omg_inner_types.h index c48d1649..6cc4afd3 100644 --- a/inc/framework/omg/omg_inner_types.h +++ b/inc/framework/omg/omg_inner_types.h @@ -100,6 +100,8 @@ struct OmgContext { std::vector net_out_nodes; // net out nodes top names(only caffe has top) std::vector out_top_names; + // net data nodes top names(only caffe has top) + std::vector data_top_names; // preferential format used by the entire network domiTensorFormat_t net_format = DOMI_TENSOR_RESERVED; domi::FrameworkType type = domi::FRAMEWORK_RESERVED; diff --git 
a/inc/graph/debug/ge_attr_define.h b/inc/graph/debug/ge_attr_define.h index d0335eb8..b7b3c3cd 100644 --- a/inc/graph/debug/ge_attr_define.h +++ b/inc/graph/debug/ge_attr_define.h @@ -187,6 +187,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MOD GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_AUTOMIC_ADD_START; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_AUTOMIC_ADD_MEM_SIZE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_STREAM_LABEL; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_RTS_LABEL_NODE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_STREAM_CYCLE_EVENT_FLAG; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_DYNAMIC_OUTPUT_DIMS; @@ -778,8 +779,6 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_TASK_GEN_VAR_ADDR; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_STREAM_LABEL; - GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_CONTINUOUS_STREAM_LABEL; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_VAR_SIZE; diff --git a/inc/graph/node.h b/inc/graph/node.h index e1ffe5b7..88dec93e 100644 --- a/inc/graph/node.h +++ b/inc/graph/node.h @@ -95,6 +95,7 @@ class Node : public std::enable_shared_from_this { ComputeGraphPtr GetOwnerComputeGraph() const; graphStatus SetOwnerComputeGraph(const ComputeGraphPtr &graph); + graphStatus SetAnyOwnerComputeGraph(const ComputeGraphPtr &graph); Vistor GetAllInDataAnchors() const; Vistor GetAllOutDataAnchors() const; diff --git a/inc/graph/utils/graph_utils.h b/inc/graph/utils/graph_utils.h index 2933d034..cbcd82b1 100644 --- a/inc/graph/utils/graph_utils.h +++ b/inc/graph/utils/graph_utils.h @@ -141,6 +141,8 @@ class GraphUtils { static Graph CreateGraphFromComputeGraph(const ComputeGraphPtr compute_graph); + static GraphPtr CreateGraphPtrFromComputeGraph(const ComputeGraphPtr compute_graph); + static graphStatus RecoverGraphOperators(const Graph &graph); static ComputeGraphPtr CreateGraphFromOperator(const string &name, const std::vector &inputs); diff --git a/src/common/graph/ge_attr_define.cc b/src/common/graph/ge_attr_define.cc index 7b9f023f..25b6aa1e 100644 --- a/src/common/graph/ge_attr_define.cc +++ b/src/common/graph/ge_attr_define.cc @@ -157,6 +157,7 @@ const std::string ATTR_NAME_WEIGHTS_DATA = "weights_data"; const std::string ATTR_NAME_BROACAST_REAL_DIM_CNT = "broacast_real_dim_cnt"; const std::string ATTR_NAME_DIM_ALIGN = "dim_align"; const std::string ATTR_NAME_STREAM_LABEL = "_stream_label"; +const std::string ATTR_NAME_RTS_LABEL_NODE = "_rts_label_node"; const std::string ATTR_NAME_CONTINUOUS_STREAM_LABEL = "_continuous_stream_label"; const std::string ATTR_NAME_STREAM_CYCLE_EVENT_FLAG = "need_stream_cycle_event"; const std::string ATTR_NAME_RTSWITCH_RECV_EVENT_ID = "rtswitch_event_id"; diff --git a/src/common/graph/gnode.cc b/src/common/graph/gnode.cc index 33450a5c..9c9fa195 100644 --- a/src/common/graph/gnode.cc +++ b/src/common/graph/gnode.cc @@ -25,6 +25,7 @@ #include "graph/utils/tensor_adapter.h" #include #include "graph/debug/ge_attr_define.h" +#include "graph/debug/ge_op_types.h" #include "utils/node_utils.h" #include "utils/op_desc_utils.h" @@ -264,20 +265,34 @@ 
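
Ahead of the gnode.cc hunk that follows, a caller-side sketch of the interface changes declared earlier: GetSubgraph and GetALLSubgraphs now fill caller-provided references, and both are backed by the new GraphUtils::CreateGraphPtrFromComputeGraph, which wraps the node's existing ComputeGraph instead of copying it. Everything other than those names is illustrative:

  // Usage sketch (illustrative); `node` is assumed to be an existing ge::GNode.
  ge::GraphPtr subgraph = nullptr;
  if (node.GetSubgraph(0U, subgraph) == ge::GRAPH_SUCCESS && subgraph != nullptr) {
    // subgraph shares the node's underlying ComputeGraph (no copy is made).
  }

  std::vector<ge::GraphPtr> subgraphs;
  if (node.GetALLSubgraphs(subgraphs) == ge::GRAPH_SUCCESS) {
    // subgraphs is filled in place; an empty result now only logs a warning.
  }
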
graphStatus GNode::GetInputConstData(const int32_t index, Tensor &data) const { } NodePtr input_data_node = NodeUtils::GetInDataNodeByIndex(*node_ptr, index); - bool is_const = NodeUtils::IsConst(*input_data_node); - if (!is_const) { - GELOGE(GRAPH_NODE_WITHOUT_CONST_INPUT, "Node[%s] has no const input.", node_ptr->GetName().c_str()); - return GRAPH_NODE_WITHOUT_CONST_INPUT; - } - - Operator const_op = OpDescUtils::CreateOperatorFromNode(input_data_node); - if (const_op.GetAttr(ATTR_NAME_WEIGHTS, data) != GRAPH_SUCCESS) { - GELOGE(GRAPH_FAILED, "Input data node[%s] of node[%s] get data failed.", input_data_node->GetName().c_str(), - node_ptr->GetName().c_str()); - return GRAPH_FAILED; + GE_CHECK_NOTNULL(input_data_node); + string op_type = input_data_node->GetType(); + if (op_type == CONSTANT || op_type == CONSTANTOP) { + Operator const_op = OpDescUtils::CreateOperatorFromNode(input_data_node); + if (const_op.GetAttr(ATTR_NAME_WEIGHTS, data) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Input data node[%s] of node[%s] get data failed.", input_data_node->GetName().c_str(), + node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + return SUCCESS; + } else if (op_type == DATA) { + auto parent_node = NodeUtils::GetParentInput(input_data_node); + while ((parent_node != nullptr) && (parent_node->GetType() == DATA)) { + parent_node = NodeUtils::GetParentInput(parent_node); + } + if ((parent_node != nullptr) && ((parent_node->GetType() == CONSTANT) || (parent_node->GetType() == CONSTANTOP))) { + Operator const_op = OpDescUtils::CreateOperatorFromNode(parent_node); + if (const_op.GetAttr(ATTR_NAME_WEIGHTS, data) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Input data node[%s] of node[%s] get data failed.", parent_node->GetName().c_str(), + node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + return GRAPH_SUCCESS; + } } - return GRAPH_SUCCESS; + GELOGE(GRAPH_NODE_WITHOUT_CONST_INPUT, "Node[%s] has no const input.", node_ptr->GetName().c_str()); + return GRAPH_NODE_WITHOUT_CONST_INPUT; } graphStatus GNode::GetInputIndexByName(const ge::AscendString &name, int32_t &index) { @@ -793,7 +808,7 @@ bool GNode::HasAttr(const ge::AscendString &name) { return true; } -graphStatus GNode::GetSubgraph(uint32_t index, GraphPtr graph) const { +graphStatus GNode::GetSubgraph(uint32_t index, GraphPtr &graph) const { if (impl_ == nullptr) { GELOGE(GRAPH_FAILED, "GetSubgraph: node impl is nullptr."); return GRAPH_FAILED; @@ -807,20 +822,20 @@ graphStatus GNode::GetSubgraph(uint32_t index, GraphPtr graph) const { ComputeGraphPtr compute_graph_ptr = NodeUtils::GetSubgraph(*node_ptr, index); if (compute_graph_ptr == nullptr) { - GELOGE(GRAPH_FAILED, "GetSubgraph: get subgraph[%u] failed form node[%s].", index, node_ptr->GetName().c_str()); + GELOGE(GRAPH_FAILED, "GetSubgraph: get subgraph[%u] failed from node[%s].", index, node_ptr->GetName().c_str()); return GRAPH_FAILED; } - Graph create_graph = GraphUtils::CreateGraphFromComputeGraph(compute_graph_ptr); - graph = std::make_shared(create_graph); + + graph = GraphUtils::CreateGraphPtrFromComputeGraph(compute_graph_ptr); if (graph == nullptr) { - GELOGE(GRAPH_FAILED, "GetSubgraph: graph make shared failed form node[%s].", node_ptr->GetName().c_str()); + GELOGE(GRAPH_FAILED, "GetSubgraph: get subgraph[%u] failed from node[%s].", index, node_ptr->GetName().c_str()); return GRAPH_FAILED; } return GRAPH_SUCCESS; } -graphStatus GNode::GetALLSubgraphs(std::vector graph_list) const { +graphStatus GNode::GetALLSubgraphs(std::vector &graph_list) const { if (impl_ == 
nullptr) { GELOGE(GRAPH_FAILED, "GetALLSubgraphs: node impl is nullptr."); return GRAPH_FAILED; @@ -834,24 +849,27 @@ graphStatus GNode::GetALLSubgraphs(std::vector graph_list) const { std::vector sub_graphs = NodeUtils::GetAllSubgraphs(*node_ptr); if (sub_graphs.empty()) { - GELOGE(GRAPH_FAILED, "GetALLSubgraphs: get all subgraphs failed form node[%s].", node_ptr->GetName().c_str()); + GELOGE(GRAPH_FAILED, "GetALLSubgraphs: get all subgraphs failed from node[%s].", node_ptr->GetName().c_str()); return GRAPH_FAILED; } for (auto &sub_graph : sub_graphs) { if (sub_graph == nullptr) { - GELOGE(GRAPH_FAILED, "Get subgraph failed form node[%s].", node_ptr->GetName().c_str()); + GELOGE(GRAPH_FAILED, "Get subgraph failed from node[%s].", node_ptr->GetName().c_str()); return GRAPH_FAILED; } - Graph create_graph = GraphUtils::CreateGraphFromComputeGraph(sub_graph); - GraphPtr graph = std::make_shared(create_graph); + GraphPtr graph = GraphUtils::CreateGraphPtrFromComputeGraph(sub_graph); if (graph == nullptr) { - GELOGE(GRAPH_FAILED, "Subgraph make shared failed form node[%s].", node_ptr->GetName().c_str()); + GELOGE(GRAPH_FAILED, "Subgraph create compute graph failed from node[%s].", node_ptr->GetName().c_str()); return GRAPH_FAILED; } graph_list.emplace_back(graph); } + if (graph_list.empty()) { + GELOGW("Node[%s] has no subgraph.", node_ptr->GetName().c_str()); + } + return GRAPH_SUCCESS; } } // namespace ge diff --git a/src/common/graph/graph.cc b/src/common/graph/graph.cc index 181b38d1..f531588e 100644 --- a/src/common/graph/graph.cc +++ b/src/common/graph/graph.cc @@ -24,6 +24,7 @@ #include "graph/utils/graph_utils.h" #include "graph/utils/op_desc_utils.h" #include "graph/utils/node_adapter.h" +#include "graph/utils/node_utils.h" using std::map; using std::pair; @@ -246,6 +247,53 @@ class GraphImpl { ComputeGraphPtr GetComputeGraph() const { return compute_graph_; } + graphStatus RemoveEdge(NodePtr &src_node_ptr, const int32_t src_port_index, NodePtr &dst_node_ptr, + const int32_t dst_port_index) { + GE_CHECK_NOTNULL(src_node_ptr); + GE_CHECK_NOTNULL(dst_node_ptr); + + graphStatus res = GRAPH_FAILED; + if ((src_port_index == -1) && (dst_port_index == -1)) { + if (src_node_ptr->GetOutControlAnchor() == nullptr) { + GELOGE(GRAPH_FAILED, "RemoveEdge: src node[%s] out control anchor is null.", src_node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + res = GraphUtils::RemoveEdge(src_node_ptr->GetOutControlAnchor(), dst_node_ptr->GetInControlAnchor()); + if (res != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "RemoveEdge: remove control edge between [%s] and [%s]failed.", + src_node_ptr->GetName().c_str(), dst_node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + return GRAPH_SUCCESS; + } + + if (src_node_ptr->GetOutDataAnchor(src_port_index) == nullptr) { + GELOGE(GRAPH_FAILED, "RemoveEdge: src node[%s] out data anchor[%d] is null.", src_node_ptr->GetName().c_str(), + src_port_index); + return GRAPH_FAILED; + } + + if (src_port_index != -1 && dst_port_index == -1) { + res = GraphUtils::RemoveEdge(src_node_ptr->GetOutDataAnchor(src_port_index), dst_node_ptr->GetInControlAnchor()); + if (res != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "RemoveEdge: remove data-control edge between [%s] and [%s]failed.", + src_node_ptr->GetName().c_str(), dst_node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + return GRAPH_SUCCESS; + } + + res = GraphUtils::RemoveEdge(src_node_ptr->GetOutDataAnchor(src_port_index), + dst_node_ptr->GetInDataAnchor(dst_port_index)); + if (res != GRAPH_SUCCESS) { + 
GELOGE(GRAPH_FAILED, "RemoveEdge: remove data edge between [%s] and [%s] failed.", + src_node_ptr->GetName().c_str(), dst_node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + return GRAPH_SUCCESS; + } + private: std::string name_; std::string output_name_; @@ -392,17 +440,25 @@ graphStatus Graph::RemoveNode(GNode &node) { return GRAPH_FAILED; } + if (node_ptr->GetOwnerComputeGraph() == nullptr) { + GELOGE(GRAPH_FAILED, "RemoveNode: node[%s] is invalid.", node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + ComputeGraphPtr compute_graph_ptr = impl_->GetComputeGraph(); if (compute_graph_ptr == nullptr) { GELOGE(GRAPH_FAILED, "RemoveNde: compute graph ptr is nullptr."); return GRAPH_FAILED; } - if (compute_graph_ptr->RemoveNode(node_ptr) != GRAPH_SUCCESS) { - GELOGE(GRAPH_FAILED, "RemoveNde: remove node failed."); + ge::NodeUtils::UnlinkAll(*node_ptr); + if (GraphUtils::RemoveNodeWithoutRelink(compute_graph_ptr, node_ptr) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "RemoveNode: remove node[%s] failed.", node_ptr->GetName().c_str()); return GRAPH_FAILED; } + node_ptr->SetAnyOwnerComputeGraph(nullptr); + return GRAPH_SUCCESS; } @@ -430,31 +486,21 @@ graphStatus Graph::RemoveEdge(GNode &src_node, const int32_t src_port_index, GNo return GRAPH_FAILED; } - graphStatus res = GRAPH_FAILED; - if ((src_port_index == -1) && (dst_port_index == -1)) { - res = GraphUtils::RemoveEdge(src_node_ptr->GetOutControlAnchor(), dst_node_ptr->GetInControlAnchor()); - if (res != GRAPH_SUCCESS) { - GELOGE(GRAPH_FAILED, "RemoveEdge: remove control edge failed."); - return GRAPH_FAILED; - } - return GRAPH_SUCCESS; + if (src_node_ptr->GetOwnerComputeGraph() == nullptr) { + GELOGE(GRAPH_FAILED, "RemoveEdge: src node[%s] is invalid.", src_node_ptr->GetName().c_str()); + return GRAPH_FAILED; } - if (src_port_index != -1 && dst_port_index == -1) { - res = GraphUtils::RemoveEdge(src_node_ptr->GetOutDataAnchor(src_port_index), dst_node_ptr->GetInControlAnchor()); - if (res != GRAPH_SUCCESS) { - GELOGE(GRAPH_FAILED, "RemoveEdge: remove data-control edge failed."); - return GRAPH_FAILED; - } - return GRAPH_SUCCESS; + if (dst_node_ptr->GetOwnerComputeGraph() == nullptr) { + GELOGE(GRAPH_FAILED, "RemoveEdge: dst node[%s] is invalid.", dst_node_ptr->GetName().c_str()); + return GRAPH_FAILED; } - res = GraphUtils::RemoveEdge(src_node_ptr->GetOutDataAnchor(src_port_index), - dst_node_ptr->GetInDataAnchor(dst_port_index)); - if (res != GRAPH_SUCCESS) { - GELOGE(GRAPH_FAILED, "RemoveEdge: remove data edge failed."); + if (impl_->RemoveEdge(src_node_ptr, src_port_index, dst_node_ptr, dst_port_index) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "RemoveEdge: remove edge failed."); return GRAPH_FAILED; } + return GRAPH_SUCCESS; } @@ -501,6 +547,16 @@ graphStatus Graph::AddDataEdge(GNode &src_node, const int32_t src_port_index, GN return GRAPH_FAILED; } + if (src_node_ptr->GetOwnerComputeGraph() == nullptr) { + GELOGE(GRAPH_FAILED, "AddDataEdge: src node[%s] is invalid.", src_node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + if (dst_node_ptr->GetOwnerComputeGraph() == nullptr) { + GELOGE(GRAPH_FAILED, "AddDataEdge: dst node[%s] is invalid.", dst_node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + graphStatus res = GraphUtils::AddEdge(src_node_ptr->GetOutDataAnchor(src_port_index), dst_node_ptr->GetInDataAnchor(dst_port_index)); if (res != GRAPH_SUCCESS) { @@ -529,6 +585,16 @@ graphStatus Graph::AddControlEdge(GNode &src_node, GNode &dst_node) { return GRAPH_FAILED; } + if (src_node_ptr->GetOwnerComputeGraph() 
== nullptr) { + GELOGE(GRAPH_FAILED, "AddControlEdge: src node[%s] is invalid.", src_node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + + if (dst_node_ptr->GetOwnerComputeGraph() == nullptr) { + GELOGE(GRAPH_FAILED, "AddControlEdge: dst node[%s] is invalid.", dst_node_ptr->GetName().c_str()); + return GRAPH_FAILED; + } + graphStatus res = GraphUtils::AddEdge(src_node_ptr->GetOutControlAnchor(), dst_node_ptr->GetInControlAnchor()); if (res != GRAPH_SUCCESS) { GELOGE(GRAPH_FAILED, "AddControlEdge: Add control edge failed."); @@ -558,10 +624,9 @@ GraphPtr Graph::ConstructFromInputs(const std::vector &inputs, const g } compute_graph->SetInputSize(static_cast(inputs.size())); - Graph graph = GraphUtils::CreateGraphFromComputeGraph(compute_graph); - GraphPtr graph_ptr = std::make_shared(graph); + GraphPtr graph_ptr = GraphUtils::CreateGraphPtrFromComputeGraph(compute_graph); if (graph_ptr == nullptr) { - GELOGE(GRAPH_FAILED, "ConstructFromInputs: graph make shared failed."); + GELOGE(GRAPH_FAILED, "ConstructFromInputs: create graph from compute graph failed."); return nullptr; } @@ -604,6 +669,20 @@ GraphUtils::CreateGraphFromComputeGraph(const ge::ComputeGraphPtr compute_graph) return graph; } +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GraphPtr +GraphUtils::CreateGraphPtrFromComputeGraph(const ge::ComputeGraphPtr compute_graph) { + GE_CHK_BOOL_EXEC_NOLOG(compute_graph != nullptr, return nullptr); + + auto name = compute_graph->GetName(); + auto graph = ComGraphMakeShared(name); + GE_CHK_BOOL_EXEC_NOLOG(graph != nullptr, return nullptr); + GE_CHK_BOOL_EXEC_NOLOG(graph->impl_ != nullptr, return nullptr); + + graph->impl_->compute_graph_ = compute_graph; + + return graph; +} + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus GraphUtils::RecoverGraphOperators(const Graph &graph) { GE_CHECK_NOTNULL(graph.impl_); GE_CHECK_NOTNULL(graph.impl_->compute_graph_); diff --git a/src/common/graph/node.cc b/src/common/graph/node.cc index d33c6008..a9dde91b 100644 --- a/src/common/graph/node.cc +++ b/src/common/graph/node.cc @@ -393,6 +393,11 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus Node::SetOwnerCompute return GRAPH_SUCCESS; } +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus Node::SetAnyOwnerComputeGraph(const ComputeGraphPtr &graph) { + owner_graph_ = graph; + return GRAPH_SUCCESS; +} + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Node::Vistor Node::GetAllInDataAnchors() const { return Vistor(shared_from_this(), in_data_anchors_); } diff --git a/src/common/graph/utils/tensor_utils.cc b/src/common/graph/utils/tensor_utils.cc index 26ac8cc8..dba3e4c5 100644 --- a/src/common/graph/utils/tensor_utils.cc +++ b/src/common/graph/utils/tensor_utils.cc @@ -292,6 +292,8 @@ static graphStatus CalcTensorElementCnt(const std::vector &dims, Format graph_status = CalcElementCntByDims(dims, element_cnt); break; default: + ErrorManager::GetInstance().ATCReportErrMessage( + "E19012", {"function", "reason"}, {"CalcTensorElementCnt", "format[" + format_str + "] is not support"}); GELOGE(GRAPH_FAILED, "unsupported format, format=%d(%s).", format, format_str.c_str()); graph_status = GRAPH_FAILED; break; diff --git a/src/common/graph/utils/type_utils.cc b/src/common/graph/utils/type_utils.cc index 2efc530e..8758462a 100644 --- a/src/common/graph/utils/type_utils.cc +++ b/src/common/graph/utils/type_utils.cc @@ -16,6 +16,7 @@ #include "graph/utils/type_utils.h" #include "debug/ge_util.h" +#include "common/util/error_manager/error_manager.h" using domi::domiTensorFormat_t; @@ 
-431,6 +432,9 @@ bool TypeUtils::GetDataTypeLength(ge::DataType data_type, uint32_t &length) { length = it->second; return true; } else { + ErrorManager::GetInstance().ATCReportErrMessage( + "E19012", {"function", "reason"}, + {"GetDataTypeLength", "data_type[" + std::to_string(data_type) + "] is not support"}); GELOGE(GRAPH_FAILED, "data_type not support %d", data_type); return false; } diff --git a/src/ge/CMakeLists.txt b/src/ge/CMakeLists.txt index b05bb6ce..4faba8db 100755 --- a/src/ge/CMakeLists.txt +++ b/src/ge/CMakeLists.txt @@ -96,6 +96,7 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/load/new_model_manager/task_info/label_switch_by_index_task_info.cc" "graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc" "graph/load/new_model_manager/task_info/memcpy_async_task_info.cc" + "graph/load/new_model_manager/task_info/model_exit_task_info.cc" "graph/load/new_model_manager/task_info/profiler_trace_task_info.cc" "graph/load/new_model_manager/task_info/stream_active_task_info.cc" "graph/load/new_model_manager/task_info/stream_switch_task_info.cc" @@ -277,6 +278,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/load/new_model_manager/task_info/label_switch_by_index_task_info.cc" "graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc" "graph/load/new_model_manager/task_info/memcpy_async_task_info.cc" + "graph/load/new_model_manager/task_info/model_exit_task_info.cc" "graph/load/new_model_manager/task_info/profiler_trace_task_info.cc" "graph/load/new_model_manager/task_info/stream_active_task_info.cc" "graph/load/new_model_manager/task_info/stream_switch_task_info.cc" diff --git a/src/ge/common/types.cc b/src/ge/common/types.cc index ecda37e2..220923c5 100644 --- a/src/ge/common/types.cc +++ b/src/ge/common/types.cc @@ -398,6 +398,7 @@ REGISTER_OPTYPE_DEFINE(MEMCPYASYNC, "MemcpyAsync"); REGISTER_OPTYPE_DEFINE(MEMCPYADDRASYNC, "MemcpyAddrAsync"); REGISTER_OPTYPE_DEFINE(STREAMMERGE, "StreamMerge"); REGISTER_OPTYPE_DEFINE(ENDGRAPH, "EndGraph"); +REGISTER_OPTYPE_DEFINE(MODELEXIT, "ModelExit"); REGISTER_OPTYPE_DEFINE(SEND, "Send"); REGISTER_OPTYPE_DEFINE(RECV, "Recv"); REGISTER_OPTYPE_DEFINE(ENDOFSEQUENCE, "EndOfSequence"); diff --git a/src/ge/executor/ge_executor.cc b/src/ge/executor/ge_executor.cc index 9c116fee..e642f41c 100644 --- a/src/ge/executor/ge_executor.cc +++ b/src/ge/executor/ge_executor.cc @@ -1056,6 +1056,7 @@ ge::Status GeExecutor::ExecuteAsync(DynamicSingleOp *executor, const vectorClearAicpuSo(); return SingleOpManager::GetInstance().ReleaseResource(stream); } diff --git a/src/ge/executor/module.mk b/src/ge/executor/module.mk index c6831077..309feb10 100644 --- a/src/ge/executor/module.mk +++ b/src/ge/executor/module.mk @@ -48,6 +48,7 @@ local_ge_executor_src_files := \ ../graph/load/new_model_manager/task_info/stream_switch_task_info.cc \ ../graph/load/new_model_manager/task_info/stream_switchn_task_info.cc \ ../graph/load/new_model_manager/task_info/end_graph_task_info.cc \ + ../graph/load/new_model_manager/task_info/model_exit_task_info.cc \ ../graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc \ ../graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc \ ../opskernel_manager/ops_kernel_builder_manager.cc \ diff --git a/src/ge/ge_inference.mk b/src/ge/ge_inference.mk index ac106346..a33174e5 100644 --- a/src/ge/ge_inference.mk +++ b/src/ge/ge_inference.mk @@ -109,6 +109,7 @@ OMG_HOST_SRC_FILES := \ graph/passes/atomic_addr_clean_pass.cc \ 
graph/passes/mark_same_addr_pass.cc \ graph/passes/mark_graph_unknown_status_pass.cc \ + graph/passes/mark_agnostic_pass.cc \ graph/common/omg_util.cc \ graph/common/bcast.cc \ graph/common/local_context.cc \ @@ -176,6 +177,7 @@ OMG_HOST_SRC_FILES := \ graph/passes/cast_translate_pass.cc \ graph/passes/prune_pass.cc \ graph/passes/merge_to_stream_merge_pass.cc \ + graph/passes/merge_input_memcpy_pass.cc \ graph/passes/switch_to_stream_switch_pass.cc \ graph/passes/attach_stream_label_pass.cc \ graph/passes/multi_batch_pass.cc \ @@ -247,6 +249,7 @@ OME_HOST_SRC_FILES := \ graph/load/new_model_manager/task_info/stream_switch_task_info.cc \ graph/load/new_model_manager/task_info/stream_switchn_task_info.cc \ graph/load/new_model_manager/task_info/end_graph_task_info.cc \ + graph/load/new_model_manager/task_info/model_exit_task_info.cc \ graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc \ graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc \ single_op/task/op_task.cc \ diff --git a/src/ge/ge_runner.mk b/src/ge/ge_runner.mk index 6c448a46..270e632f 100644 --- a/src/ge/ge_runner.mk +++ b/src/ge/ge_runner.mk @@ -61,6 +61,7 @@ LIBGE_LOCAL_SRC_FILES := \ graph/load/new_model_manager/model_utils.cc \ graph/load/new_model_manager/aipp_utils.cc \ graph/load/new_model_manager/task_info/end_graph_task_info.cc \ + graph/load/new_model_manager/task_info/model_exit_task_info.cc \ graph/load/new_model_manager/task_info/event_record_task_info.cc \ graph/load/new_model_manager/task_info/event_wait_task_info.cc \ graph/load/new_model_manager/task_info/fusion_start_task_info.cc \ @@ -110,6 +111,7 @@ LIBGE_LOCAL_SRC_FILES := \ graph/passes/atomic_addr_clean_pass.cc \ graph/passes/mark_same_addr_pass.cc \ graph/passes/mark_graph_unknown_status_pass.cc \ + graph/passes/mark_agnostic_pass.cc \ graph/partition/dynamic_shape_partition.cc \ graph/partition/stage_partition.cc \ graph/passes/base_pass.cc \ @@ -210,6 +212,7 @@ LIBGE_LOCAL_SRC_FILES := \ graph/passes/switch_data_edges_bypass.cc \ graph/passes/switch_logic_remove_pass.cc \ graph/passes/merge_to_stream_merge_pass.cc \ + graph/passes/merge_input_memcpy_pass.cc \ graph/passes/switch_to_stream_switch_pass.cc \ graph/passes/attach_stream_label_pass.cc \ graph/passes/switch_dead_branch_elimination.cc \ diff --git a/src/ge/graph/build/logical_stream_allocator.cc b/src/ge/graph/build/logical_stream_allocator.cc index d1866584..4d113f7e 100644 --- a/src/ge/graph/build/logical_stream_allocator.cc +++ b/src/ge/graph/build/logical_stream_allocator.cc @@ -462,8 +462,7 @@ Status AllReduceParallelPass::Run(ComputeGraphPtr graph, const vector all_reduce_succs; for (const NodePtr &node : graph->GetDirectNode()) { - if ((node->GetType() != HCOMALLREDUCE && node->GetType() != HVDCALLBACKALLREDUCE) || - node->GetInDataNodes().size() <= 1) { + if (!IsHcomNode(node->GetType()) || node->GetInDataNodes().size() <= 1) { continue; } @@ -507,14 +506,20 @@ Status AllReduceParallelPass::Run(ComputeGraphPtr graph, const vectorGetName().c_str(), old_stream, new_stream); - node->GetOpDesc()->SetStreamId(new_stream); + if (!IsHcomNode(node->GetType())) { + GELOGI("Stream of node %s has been updated from %ld to %ld.", node->GetName().c_str(), old_stream, new_stream); + node->GetOpDesc()->SetStreamId(new_stream); + } } } return !all_reduce_succs.empty() ? 
SUCCESS : NOT_CHANGED; } +bool AllReduceParallelPass::IsHcomNode(const std::string &node_type) { + return (node_type == HCOMALLREDUCE || node_type == HVDCALLBACKALLREDUCE); +} + LogicalStreamAllocator::LogicalStreamAllocator(const map &scheduler_confs, const map &max_parallel_num) : scheduler_confs_(scheduler_confs), max_parallel_num_(max_parallel_num) {} diff --git a/src/ge/graph/build/logical_stream_allocator.h b/src/ge/graph/build/logical_stream_allocator.h index 280a4104..d8f677f8 100644 --- a/src/ge/graph/build/logical_stream_allocator.h +++ b/src/ge/graph/build/logical_stream_allocator.h @@ -166,6 +166,9 @@ class AllReduceParallelPass : public LogicalStreamPass { public: STREAM_PASS_DEFAULT_FUNC(AllReduceParallelPass); Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; + + private: + bool IsHcomNode(const std::string &node_type); }; // Assign logical streams which is not limited by the number of tasks. diff --git a/src/ge/graph/build/memory/block_mem_assigner.cc b/src/ge/graph/build/memory/block_mem_assigner.cc index 19138b90..c09f3ba7 100644 --- a/src/ge/graph/build/memory/block_mem_assigner.cc +++ b/src/ge/graph/build/memory/block_mem_assigner.cc @@ -870,9 +870,11 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, string ge_disable_reuse_mem_env = "0"; (void)ge::GetContext().GetOption(OPTION_EXEC_DISABLE_REUSED_MEMORY, ge_disable_reuse_mem_env); if (ge_disable_reuse_mem_env != "1") { - bool reuse_mem_flag = !((workspace_reuse_flag.size() > out_index) && !workspace_reuse_flag[out_index]); + bool reuse_mem_flag = (mem_type == kOutput) + ? IsPreReuse(n, out_index) + : !((workspace_reuse_flag.size() > out_index) && !workspace_reuse_flag[out_index]); is_reuse_memory = !node_op_desc->HasAttr(kL2FusionDynamicConvergeOp) && !node_op_desc->HasAttr(kOpNoReuseMem) && - reuse_mem_flag && is_op_reuse_mem && (IsPreReuse(n, out_index)); + reuse_mem_flag && is_op_reuse_mem; auto stream_id = node_op_desc->GetStreamId(); if (is_reuse_memory && !continuous && !reusable_blocks_[memory_type].empty()) { for (auto it = reusable_blocks_[memory_type][stream_id].begin(); diff --git a/src/ge/graph/load/new_model_manager/davinci_model.cc b/src/ge/graph/load/new_model_manager/davinci_model.cc index e4512e4f..b5335d1b 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.cc +++ b/src/ge/graph/load/new_model_manager/davinci_model.cc @@ -464,6 +464,8 @@ Status DavinciModel::DoTaskSink() { GE_CHK_STATUS_RET(InitTaskInfo(*model_task_def.get()), "InitTaskInfo failed."); + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "Launch cust aicpu so failed."); + GE_CHK_STATUS_RET(InitEntryTask(), "InitEntryTask failed."); GE_CHK_STATUS_RET(DistributeTask(), "Distribute failed."); @@ -2051,6 +2053,7 @@ Status DavinciModel::SinkModelProfile() { std::set task_id_set; for (int32_t i = 0; i < task_num; i++) { auto task = task_list_[i]; + GE_CHECK_NOTNULL(task); auto fusion_op_info = task->GetFusionOpInfo(); // when type is RT_MODEL_TASK_KERNEL, ctx is not null if (fusion_op_info != nullptr) { @@ -2077,6 +2080,7 @@ Status DavinciModel::SinkModelProfile() { using Range = std::pair; for (int32_t i = 0; i < task_num; i++) { auto task = task_list_[i]; + GE_CHECK_NOTNULL(task); auto fusion_op_info = task->GetFusionOpInfo(); if (fusion_op_info != nullptr && fusion_op_info->original_op_names.size() > 0) { uint32_t task_id = task->GetTaskID(); diff --git a/src/ge/graph/load/new_model_manager/model_manager.cc 
b/src/ge/graph/load/new_model_manager/model_manager.cc index 4a596738..894a3df2 100644 --- a/src/ge/graph/load/new_model_manager/model_manager.cc +++ b/src/ge/graph/load/new_model_manager/model_manager.cc @@ -43,13 +43,18 @@ const std::string kCmdTypeProfInit = "prof_init"; const std::string kCmdTypeProfFinalize = "prof_finalize"; const std::string kCmdTypeProfStart = "prof_start"; const std::string kCmdTypeProfStop = "prof_stop"; -const char *const kLoadOpFromBuf = "loadOpFromBuf"; +const char *const kBatchLoadBuf = "batchLoadsoFrombuf"; +const char *const kDeleteCustOp = "deleteCustOp"; struct CustAicpuSoBuf { uint64_t kernelSoBuf; uint32_t kernelSoBufLen; uint64_t kernelSoName; uint32_t kernelSoNameLen; } __attribute__((packed)); +struct BatchLoadOpFromBufArgs { + uint32_t soNum; + uint64_t args; +} __attribute__((packed)); } // namespace DumpProperties ModelManager::dump_properties_; @@ -236,6 +241,7 @@ ModelManager::~ModelManager() { std::lock_guard lock(map_mutex_); model_map_.clear(); model_aicpu_kernel_.clear(); + cust_aicpu_so_.clear(); GE_IF_BOOL_EXEC(device_count > 0, GE_CHK_RT(rtDeviceReset(0))); } @@ -399,7 +405,6 @@ Status ModelManager::Unload(uint32_t model_id) { } std::lock_guard lock(exeception_infos_mutex_); exception_infos_.clear(); - cust_aicpu_so_.clear(); return SUCCESS; } @@ -1096,64 +1101,149 @@ Status ModelManager::CreateAicpuSession(uint64_t session_id) { return SUCCESS; } -Status ModelManager::LoadCustAicpuSo(const OpDescPtr op_desc, string so_name) { +Status ModelManager::LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_name) { + GELOGI("LoadCustAicpuSo in, op name %s, so name %s", op_desc->GetName().c_str(), so_name.c_str()); std::lock_guard lock(cust_aicpu_mutex_); - auto it = cust_aicpu_so_.find(so_name); + CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr()); + if (aicpu_kernel == nullptr) { + GELOGE(INTERNAL_ERROR, "cust aicpu op %s can't find kernel!", op_desc->GetName().c_str()); + return INTERNAL_ERROR; + } + + // get current context + rtContext_t rt_cur_ctx = nullptr; + auto rt_error = rtCtxGetCurrent(&rt_cur_ctx); + if (rt_error != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "get current context failed, runtime result is %d", static_cast(rt_error)); + return RT_FAILED; + } + + // use current context as resource key + uintptr_t resource_id = reinterpret_cast(rt_cur_ctx); + auto it = cust_aicpu_so_.find(resource_id); if (it == cust_aicpu_so_.end()) { - GE_CHK_STATUS_RET(LaunchCustAicpuSo(op_desc, so_name), "LaunchCustAicpuSo failed. 
op name %s, so_name %s", - op_desc->GetName().c_str(), so_name.c_str()); - (void)cust_aicpu_so_.insert(so_name); - GELOGI("LaunchCustAicpuSo op name %s, so_name %s.", op_desc->GetName().c_str(), so_name.c_str()); + std::map new_so_name; + new_so_name.insert({so_name, aicpu_kernel}); + cust_aicpu_so_[resource_id] = new_so_name; + GELOGI("LoadCustAicpuSo new aicpu so resource id %lu", resource_id); + return SUCCESS; + } + auto it_so_name = it->second.find(so_name); + if (it_so_name == it->second.end()) { + it->second.insert({so_name, aicpu_kernel}); + GELOGI("LoadCustAicpuSo add aicpu so resource id %lu", resource_id); } return SUCCESS; } -Status ModelManager::LaunchCustAicpuSo(const OpDescPtr op_desc, string so_name) { - CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr()); - if (aicpu_kernel == nullptr) { - GELOGE(INTERNAL_ERROR, "cust aicpu op %s can't find kernel!", op_desc->GetName().c_str()); - return INTERNAL_ERROR; +Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { + GELOGI("LaunchCustAucpuSo in, kernel name %s", kernel_name.c_str()); + std::lock_guard lock(cust_aicpu_mutex_); + if (cust_aicpu_so_.size() == 0) return SUCCESS; + // get current context + rtContext_t rt_cur_ctx = nullptr; + auto rt_error = rtCtxGetCurrent(&rt_cur_ctx); + if (rt_error != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "get current context failed, runtime result is %d", static_cast(rt_error)); + return RT_FAILED; + } + uintptr_t resource_id = reinterpret_cast(rt_cur_ctx); + auto it = cust_aicpu_so_.find(resource_id); + if (it == cust_aicpu_so_.end()) { + GELOGI("Cust aicpu so map is empty, context id %lu", resource_id); + return SUCCESS; } - const void *aicpu_data = aicpu_kernel->GetBinData(); - uint32_t aicpu_data_length = aicpu_kernel->GetBinDataSize(); - void *d_aicpu_data = nullptr; - void *d_so_name = nullptr; - void *args = nullptr; + vector allocated_mem; rtError_t status; rtStream_t stream = nullptr; - GE_CHK_RT(rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM)); - GE_CHK_RT(rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE)); - GE_CHK_RT(rtMalloc(&d_so_name, so_name.size(), RT_MEMORY_HBM)); - GE_CHK_RT(rtMemcpy(d_so_name, so_name.size(), reinterpret_cast(so_name.c_str()), so_name.size(), - RT_MEMCPY_HOST_TO_DEVICE)); + vector v_cust_so; + void *args = nullptr; - CustAicpuSoBuf cust_aicpu_so_buf; - cust_aicpu_so_buf.kernelSoBuf = reinterpret_cast(reinterpret_cast(d_aicpu_data)); - cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length; - cust_aicpu_so_buf.kernelSoName = reinterpret_cast(reinterpret_cast(d_so_name)); - cust_aicpu_so_buf.kernelSoNameLen = so_name.size(); + for (const auto &it_so : it->second) { + const void *aicpu_data = it_so.second->GetBinData(); + uint32_t aicpu_data_length = it_so.second->GetBinDataSize(); + string so_name = it_so.first; + void *d_aicpu_data = nullptr; + void *d_so_name = nullptr; + + status = rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + allocated_mem.push_back(d_aicpu_data); + status = rtMalloc(&d_so_name, so_name.size(), RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + allocated_mem.push_back(d_so_name); + GE_CHK_RT(rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, 
aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE)); + GE_CHK_RT(rtMemcpy(d_so_name, so_name.size(), reinterpret_cast(so_name.c_str()), so_name.size(), + RT_MEMCPY_HOST_TO_DEVICE)); + + CustAicpuSoBuf cust_aicpu_so_buf; + cust_aicpu_so_buf.kernelSoBuf = reinterpret_cast(reinterpret_cast(d_aicpu_data)); + cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length; + cust_aicpu_so_buf.kernelSoName = reinterpret_cast(reinterpret_cast(d_so_name)); + cust_aicpu_so_buf.kernelSoNameLen = so_name.size(); + v_cust_so.push_back(cust_aicpu_so_buf); + } + if (kernel_name == kDeleteCustOp) { + (void)cust_aicpu_so_.erase(it); + } + + uint32_t args_size = sizeof(CustAicpuSoBuf) * v_cust_so.size(); + status = rtMalloc(&args, args_size, RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + allocated_mem.push_back(args); + GE_CHK_RT(rtMemcpy(args, args_size, v_cust_so.data(), args_size, RT_MEMCPY_HOST_TO_DEVICE)); + + BatchLoadOpFromBufArgs batch_cust_so; + batch_cust_so.soNum = v_cust_so.size(); + batch_cust_so.args = reinterpret_cast(reinterpret_cast(args)); + + void *batch_args = nullptr; + uint32_t batch_args_size = sizeof(BatchLoadOpFromBufArgs); + status = rtMalloc(&batch_args, batch_args_size, RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + return RT_ERROR_TO_GE_STATUS(status); + } + allocated_mem.push_back(batch_args); + GE_CHK_RT(rtMemcpy(batch_args, batch_args_size, static_cast(&batch_cust_so), batch_args_size, + RT_MEMCPY_HOST_TO_DEVICE)); - uint32_t args_size = sizeof(CustAicpuSoBuf); - GE_CHK_RT(rtMalloc(&args, args_size, RT_MEMORY_HBM)); - GE_CHK_RT(rtMemcpy(args, args_size, static_cast(&cust_aicpu_so_buf), args_size, RT_MEMCPY_HOST_TO_DEVICE)); GE_CHK_RT(rtStreamCreate(&stream, 0)); - GE_CHK_RT(rtCpuKernelLaunch(nullptr, kLoadOpFromBuf, 1, args, args_size, nullptr, stream)); + GE_CHK_RT(rtCpuKernelLaunch(nullptr, kernel_name.c_str(), 1, batch_args, batch_args_size, nullptr, stream)); status = rtStreamSynchronize(stream); if (status != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status); - GE_CHK_RT(rtStreamDestroy(stream)); - GE_CHK_RT(rtFree(args)); - GE_CHK_RT(rtFree(d_aicpu_data)); - GE_CHK_RT(rtFree(d_so_name)); return RT_ERROR_TO_GE_STATUS(status); } - GE_CHK_RT(rtStreamDestroy(stream)); - GE_CHK_RT(rtFree(args)); - GE_CHK_RT(rtFree(d_aicpu_data)); - GE_CHK_RT(rtFree(d_so_name)); - GELOGI("Cpu kernel launch loadOpFromBuf task success."); + std::function callback = [&]() { + for (auto mem : allocated_mem) { + GE_CHK_RT(rtFree(mem)); + } + GE_CHK_RT(rtStreamDestroy(stream)); + }; + GE_MAKE_GUARD(release, callback); + GELOGI("Cpu kernel launch task success."); + return SUCCESS; +} + +Status ModelManager::ClearAicpuSo() { + GE_CHK_STATUS_RET(LaunchKernelCustAicpuSo(kDeleteCustOp), "delete cust op so failed."); + return SUCCESS; +} + +Status ModelManager::LaunchCustAicpuSo() { + GE_CHK_STATUS_RET(LaunchKernelCustAicpuSo(kBatchLoadBuf), "launch cust op so failed."); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/model_manager.h b/src/ge/graph/load/new_model_manager/model_manager.h index 3dce3807..8e23b0ae 100644 --- a/src/ge/graph/load/new_model_manager/model_manager.h +++ b/src/ge/graph/load/new_model_manager/model_manager.h @@ -270,9 +270,13 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { ge::Status DestroyAicpuSessionForInfer(uint32_t model_id); - ge::Status 
LoadCustAicpuSo(const OpDescPtr op_desc, string so_name); + ge::Status LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_name); - ge::Status LaunchCustAicpuSo(const OpDescPtr op_desc, string so_name); + ge::Status LaunchCustAicpuSo(); + + ge::Status ClearAicpuSo(); + + ge::Status LaunchKernelCustAicpuSo(const string &kernel_name); ge::Status GetOrigInputInfo(uint32_t model_id, uint32_t index, OriginInputInfo &orig_input_info); @@ -340,7 +344,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { std::set sess_ids_; std::vector exception_infos_; std::mutex cust_aicpu_mutex_; - std::set cust_aicpu_so_; + std::map> cust_aicpu_so_; static DumpProperties dump_properties_; }; diff --git a/src/ge/graph/load/new_model_manager/model_utils.cc b/src/ge/graph/load/new_model_manager/model_utils.cc index a9877d89..75917e0e 100644 --- a/src/ge/graph/load/new_model_manager/model_utils.cc +++ b/src/ge/graph/load/new_model_manager/model_utils.cc @@ -479,13 +479,15 @@ vector ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_WORKSPACE_TYPE_LIST, workspace_memory_type); for (size_t i = 0; i < v_workspace_bytes.size(); ++i) { // Temporary solution, the aicpu workspace of multiple images cannot be shared. - if (has_workspace_reuse && i < workspace_reuse_flag.size() && !workspace_reuse_flag[i]) { + if (has_workspace_reuse && i < workspace_reuse_flag.size() && !workspace_reuse_flag[i] && + !model_param.is_single_op) { void *mem_addr = model_param.aicpu_mem_mall->Acquire(v_workspace_offset[i], v_workspace_bytes[i]); v_workspace_data_addr.push_back(mem_addr); GELOGI( "[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] aicpu workspace[%zu] offset[%ld] bytes[%ld] " "memaddr[%p]", model_param.graph_id, op_desc->GetName().c_str(), i, v_workspace_offset[i], v_workspace_bytes[i], mem_addr); + continue; } else if (has_mem_type_workspace && workspace_memory_type[i] == RT_MEMORY_P2P_DDR) { int64_t p2p_workspace_offset = v_workspace_offset[i]; int64_t p2p_workspace_bytes = v_workspace_bytes[i]; diff --git a/src/ge/graph/load/new_model_manager/task_info/model_exit_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/model_exit_task_info.cc new file mode 100644 index 00000000..ff8057aa --- /dev/null +++ b/src/ge/graph/load/new_model_manager/task_info/model_exit_task_info.cc @@ -0,0 +1,54 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
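
Before the new model-exit task files, a condensed sketch of the reworked custom-AICPU SO handling declared just above: kernels are now cached per rtContext and uploaded or deleted in one batch. In the patch the three calls live in DavinciModel::DoTaskSink, GeExecutor::ReleaseSingleOpResource and the model-load path; collapsing them into a single helper here is purely illustrative:

  // Sketch only: the ModelManager methods and kernel names come from this patch,
  // the wrapper function itself is hypothetical.
  ge::Status CacheLaunchAndClearCustomSo(const ge::OpDescPtr &op_desc, const std::string &so_name) {
    auto model_manager = ge::ModelManager::GetInstance();
    // 1. Model load: cache the kernel binary, keyed by the current rtContext.
    GE_CHK_STATUS_RET(model_manager->LoadCustAicpuSo(op_desc, so_name), "cache cust so failed.");
    // 2. Task sink: batch-upload every SO cached for this context ("batchLoadsoFrombuf").
    GE_CHK_STATUS_RET(model_manager->LaunchCustAicpuSo(), "launch cust so failed.");
    // 3. Teardown: issue "deleteCustOp" and drop this context's cache entry.
    return model_manager->ClearAicpuSo();
  }
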
+ */ + +#include "graph/load/new_model_manager/task_info/model_exit_task_info.h" + +#include "common/properties_manager.h" +#include "framework/common/debug/ge_log.h" +#include "graph/load/new_model_manager/davinci_model.h" + +namespace ge { +Status ModelExitTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { + GELOGI("InitModelExitTaskInfo Init Start."); + if (davinci_model == nullptr) { + GELOGE(PARAM_INVALID, "davinci_model is null!"); + return PARAM_INVALID; + } + + Status ret = SetStream(task_def.stream_id(), davinci_model->GetStreamList()); + if (ret != SUCCESS) { + GELOGE(ret, "SetStream fail, stream_id:%u", task_def.stream_id()); + return ret; + } + + model_ = davinci_model->GetRtModelHandle(); + GELOGI("InitModelExitTaskInfo Init Success, model:%p, stream:%p", model_, stream_); + return SUCCESS; +} + +Status ModelExitTaskInfo::Distribute() { + GELOGI("ModelExitTaskInfo Distribute Start."); + rtError_t rt_ret = rtModelExit(model_, stream_); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rtModelExit failed, ret: 0x%x", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + GELOGI("ModelExitTaskInfo Distribute Success."); + return SUCCESS; +} + +REGISTER_TASK_INFO(RT_MODEL_TASK_MODEL_EXIT, ModelExitTaskInfo); +} // namespace ge diff --git a/src/ge/graph/load/new_model_manager/task_info/model_exit_task_info.h b/src/ge/graph/load/new_model_manager/task_info/model_exit_task_info.h new file mode 100644 index 00000000..c219fcc8 --- /dev/null +++ b/src/ge/graph/load/new_model_manager/task_info/model_exit_task_info.h @@ -0,0 +1,37 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MODEL_EXIT_TASK_INFO_H_ +#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MODEL_EXIT_TASK_INFO_H_ + +#include "graph/load/new_model_manager/task_info/task_info.h" + +namespace ge { +class ModelExitTaskInfo : public TaskInfo { + public: + ModelExitTaskInfo() {} + + ~ModelExitTaskInfo() override { model_ = nullptr; } + + Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; + + Status Distribute() override; + + private: + rtModel_t model_{nullptr}; +}; +} // namespace ge +#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MODEL_EXIT_TASK_INFO_H_ diff --git a/src/ge/graph/load/new_model_manager/task_info/task_info.h b/src/ge/graph/load/new_model_manager/task_info/task_info.h index df35e093..ba2c8828 100644 --- a/src/ge/graph/load/new_model_manager/task_info/task_info.h +++ b/src/ge/graph/load/new_model_manager/task_info/task_info.h @@ -56,6 +56,7 @@ struct RuntimeParam { uint32_t label_num = 0; uint64_t session_id = 0; uint32_t graph_id = 0; + bool is_single_op = false; std::unique_ptr ts_mem_mall; std::unique_ptr aicpu_mem_mall; diff --git a/src/ge/graph/manager/graph_manager.cc b/src/ge/graph/manager/graph_manager.cc index ee5acd1f..d1cc914c 100644 --- a/src/ge/graph/manager/graph_manager.cc +++ b/src/ge/graph/manager/graph_manager.cc @@ -69,6 +69,7 @@ #include "graph/passes/link_gen_mask_nodes_pass.h" #include "graph/passes/mark_graph_unknown_status_pass.h" #include "graph/passes/merge_pass.h" +#include "graph/passes/merge_input_memcpy_pass.h" #include "graph/passes/merge_to_stream_merge_pass.h" #include "graph/passes/multi_batch_pass.h" #include "graph/passes/next_iteration_pass.h" @@ -1948,6 +1949,8 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { GELOGI("get ge.exec.variable_acc failed. set default value."); } PassManager after_merge_passes; + GE_CHK_STATUS_RET( + after_merge_passes.AddPass("OptimizeStage1_1::MergeInputMemcpyPass", new (std::nothrow) MergeInputMemcpyPass)); GE_CHK_STATUS_RET( after_merge_passes.AddPass("OptimizeStage1_1::SwitchDataEdgesBypass", new (std::nothrow) SwitchDataEdgesBypass)); GE_CHK_STATUS_RET( diff --git a/src/ge/graph/passes/base_pass.cc b/src/ge/graph/passes/base_pass.cc index 4da51ab0..cba279d3 100644 --- a/src/ge/graph/passes/base_pass.cc +++ b/src/ge/graph/passes/base_pass.cc @@ -26,7 +26,7 @@ namespace ge { namespace { -constexpr int kMaxRePassTimes = 1000; +constexpr int kMaxRePassTimes = 10000; constexpr size_t kMaxOneInNodes = 1000; // Each iteration, we take about 0.3k memory on the stack, we should change the recursion to loop later constexpr int kMaxRecursiveDepth = 20; diff --git a/src/ge/graph/passes/flow_ctrl_pass.cc b/src/ge/graph/passes/flow_ctrl_pass.cc index 430cf86d..877f940f 100644 --- a/src/ge/graph/passes/flow_ctrl_pass.cc +++ b/src/ge/graph/passes/flow_ctrl_pass.cc @@ -84,6 +84,22 @@ Status FlowCtrlPass::Run(ComputeGraphPtr compute_graph) { return graph_change ? SUCCESS : NOT_CHANGED; } +bool FlowCtrlPass::CheckMultiDataSet(ComputeGraphPtr &compute_graph) { + int data_set_num = 0; + for (auto &node : compute_graph->GetDirectNode()) { + if (node == nullptr) { + continue; + } + string type; + bool is_found = AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, type); + if (is_found && type == "IteratorV2") { + data_set_num++; + } + } + GELOGI("The ComputeGraph contain %d dataSet.", data_set_num); + return (data_set_num > 1) ? 
true : false; +} + NodePtr FlowCtrlPass::InsertOp(ComputeGraphPtr &compute_graph, const string &node_type, const string &node_name, const std::vector &input_list, const std::vector &output_list) { @@ -312,12 +328,12 @@ Status FlowCtrlPass::CreateIterCtrlFalseBranch(ComputeGraphPtr &compute_graph, c * loopCond * | * v - * switch --> Assign + * switch --> Assign --> active --> ModelExit * ^ * | * loopReset */ - // Insert Assign node + // Insert Assign node and ctrl edge NodePtr assign_node = InsertAssignOp(compute_graph, ASSIGN, NODE_NAME_FLOWCTRL_LOOP_ASSIGN, loop_cond_node, loop_reset_node); if (assign_node == nullptr || switch_node == nullptr) { @@ -327,13 +343,50 @@ Status FlowCtrlPass::CreateIterCtrlFalseBranch(ComputeGraphPtr &compute_graph, c GE_CHK_STATUS_RET(SetStreamLabel(assign_node, switch_node->GetName()), "set stream label failed"); - // 3. Insert ctrl edges graphStatus add_ret = GraphUtils::AddEdge(switch_node->GetOutControlAnchor(), assign_node->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { GELOGE(FAILED, "Add switch_node to assign_node ctrl edge failed, add_ret=%u.", add_ret); return FAILED; } + if (CheckMultiDataSet(compute_graph)) { + GELOGI("Multi dataSae exist, model_exit node is need."); + // 2. Insert active node and add ctrl edge + string active_name = switch_node->GetName() + "_StreamExitActive"; + NodePtr active_node = InsertOp(compute_graph, STREAMACTIVE, active_name, {}, {}); + if (active_node == nullptr) { + GELOGE(FAILED, "Insert stream active node:%s for IterCtrlTrueStream failed.", active_name.c_str()); + return FAILED; + } + GE_CHK_STATUS_RET(SetStreamLabel(active_node, switch_node->GetName()), "set stream label failed"); + GE_IF_BOOL_EXEC(!AttrUtils::SetBool(active_node->GetOpDesc(), ATTR_NAME_IS_LOOP_ACTIVE, true), + DOMI_LOGE("set ATTR_NAME_IS_LOOP_ACTIVE failed"); + return FAILED); + + string model_exit_name = switch_node->GetName() + "_ModelExit"; + GE_CHK_STATUS_RET(SetActiveLabelList(active_node, {model_exit_name}), "set active label list failed"); + + add_ret = GraphUtils::AddEdge(assign_node->GetOutControlAnchor(), active_node->GetInControlAnchor()); + if (add_ret != GRAPH_SUCCESS) { + GELOGE(FAILED, "Add assign_node to active_node ctrl edge failed, add_ret=%u.", add_ret); + return FAILED; + } + + // 3. Insert model exit node and add ctrl edge + NodePtr model_exit_node = InsertOp(compute_graph, MODELEXIT, model_exit_name, {}, {}); + if (model_exit_node == nullptr) { + GELOGE(FAILED, "Insert model_exit node:%s for IterCtrlTrueStream failed.", model_exit_name.c_str()); + return FAILED; + } + GE_CHK_STATUS_RET(SetStreamLabel(model_exit_node, model_exit_name), "set stream label failed"); + + add_ret = GraphUtils::AddEdge(active_node->GetOutControlAnchor(), model_exit_node->GetInControlAnchor()); + if (add_ret != GRAPH_SUCCESS) { + GELOGE(FAILED, "Add active_node to model_exit_node ctrl edge failed, add_ret=%u.", add_ret); + return FAILED; + } + } + GELOGI("CreateIterCtrlFalseBranch success."); return SUCCESS; } diff --git a/src/ge/graph/passes/flow_ctrl_pass.h b/src/ge/graph/passes/flow_ctrl_pass.h index a928aaa7..2a6123d6 100644 --- a/src/ge/graph/passes/flow_ctrl_pass.h +++ b/src/ge/graph/passes/flow_ctrl_pass.h @@ -134,6 +134,14 @@ class FlowCtrlPass : public GraphPass { /// Other: failed /// Status AddSpecialNodeIteratorCtrl(ComputeGraphPtr &compute_graph, NodePtr &loop_after_node); + + /// + /// add special iterator ctrl nodes(small cycle). 
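
For orientation, a minimal sketch of driving the pass shown above; `compute_graph` is assumed to be an existing ComputeGraphPtr and the run result values come from the pass itself:

  // Illustrative only.
  ge::FlowCtrlPass flow_ctrl_pass;
  ge::Status ret = flow_ctrl_pass.Run(compute_graph);  // SUCCESS or NOT_CHANGED
  // With a single IteratorV2 data set the false branch stays "switch --> Assign";
  // with two or more, the pass appends the StreamActive and ModelExit nodes built above.
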
+ /// @param compute_graph graph + /// @return true: two or more dataSet exist + /// false: only one dataSet exist + /// + bool CheckMultiDataSet(ComputeGraphPtr &compute_graph); }; } // namespace ge diff --git a/src/ge/graph/passes/mark_agnostic_pass.cc b/src/ge/graph/passes/mark_agnostic_pass.cc index 6f520dd8..62aa46fd 100644 --- a/src/ge/graph/passes/mark_agnostic_pass.cc +++ b/src/ge/graph/passes/mark_agnostic_pass.cc @@ -16,20 +16,40 @@ #include "graph/passes/mark_agnostic_pass.h" -#include "utils/node_utils.h" +#include "graph/utils/node_utils.h" namespace ge { Status MarkAgnosticPass::Run(ComputeGraphPtr graph) { for (const auto &node : graph->GetDirectNode()) { auto node_type = NodeUtils::GetNodeType(*node); if (node_type == SWITCH || node_type == REFSWITCH || node_type == SWITCHN) { - GELOGD("Mark format agnostic for switch ndoe %s", node->GetName().c_str()); + GELOGD("Mark format agnostic and continuous for switch node %s", node->GetName().c_str()); + const OpDescPtr op_desc = node->GetOpDesc(); + const GeTensorDescPtr op_tensor = op_desc->MutableInputDesc(0); + if (op_tensor == nullptr) { + GELOGD("Op: %s, Index:0,has no input", node->GetName().c_str()); + continue; + } + AttrUtils::SetInt(op_tensor, "_format_continuous", 1); + AttrUtils::SetInt(node->GetOpDesc(), "_format_agnostic", 1); + AttrUtils::SetListInt(node->GetOpDesc(), "_format_agnostic_except_input", std::vector({1})); + continue; + } + if (node_type == IDENTITY) { + GELOGD("Mark format agnostic for identity node %s", node->GetName().c_str()); AttrUtils::SetInt(node->GetOpDesc(), "_format_agnostic", 1); AttrUtils::SetListInt(node->GetOpDesc(), "_format_agnostic_except_input", std::vector({1})); continue; } if (node_type == MERGE || node_type == REFMERGE) { - GELOGD("Mark format agnostic for merge node %s", node->GetName().c_str()); + GELOGD("Mark format agnostic and continuous for merge node %s", node->GetName().c_str()); + const OpDescPtr op_desc = node->GetOpDesc(); + const GeTensorDescPtr op_tensor = op_desc->MutableOutputDesc(0); + if (op_tensor == nullptr) { + GELOGD("Op: %s, Index:0,has no output", node->GetName().c_str()); + continue; + } + AttrUtils::SetInt(op_tensor, "_format_continuous", 1); AttrUtils::SetInt(node->GetOpDesc(), "_format_agnostic", 1); AttrUtils::SetListInt(node->GetOpDesc(), "_format_agnostic_except_output", std::vector({1})); continue; diff --git a/src/ge/graph/passes/merge_input_memcpy_pass.cc b/src/ge/graph/passes/merge_input_memcpy_pass.cc new file mode 100644 index 00000000..51fcd649 --- /dev/null +++ b/src/ge/graph/passes/merge_input_memcpy_pass.cc @@ -0,0 +1,97 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "graph/passes/merge_input_memcpy_pass.h" +#include "common/ge/ge_util.h" +#include "ge/ge_api_types.h" +#include "graph/common/omg_util.h" + +namespace ge { +Status MergeInputMemcpyPass::Run(ComputeGraphPtr graph) { + GELOGD("MergeInputMemcpyPass Enter"); + for (const auto &node : graph->GetDirectNode()) { + if ((node->GetType() != MERGE) && (node->GetType() != REFMERGE)) { + continue; + } + GE_CHECK_NOTNULL(node->GetOpDesc()); + GE_CHK_STATUS_RET(AddMemcpyAsyncNodes(graph, node, node->GetOpDesc()->HasAttr(ATTR_INSERT_BY_MBATCH)), + "Merge add memcpy node failed."); + } + GELOGD("MergeInputMemcpyPass Leave"); + return SUCCESS; +} + +/// +/// @brief Add MemcpyAsync Op as Merge in_node +/// @param [in] graph +/// @param [in] node +/// @param [in] multi_batch_flag +/// @return Status +/// +Status MergeInputMemcpyPass::AddMemcpyAsyncNodes(const ComputeGraphPtr &graph, const NodePtr &node, + bool multi_batch_flag) { + for (const InDataAnchorPtr &in_data_anchor : node->GetAllInDataAnchors()) { + OutDataAnchorPtr peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); + NodePtr in_node = peer_out_anchor->GetOwnerNode(); + const std::string &type = in_node->GetType(); + // For WhileLoop no need memcpy for merge. + GE_IF_BOOL_EXEC((type == ENTER) || (type == REFENTER) || (type == NEXTITERATION) || (type == REFNEXTITERATION), + continue); + + const std::string &memcpy_name = node->GetName() + "_input_" + std::to_string(in_data_anchor->GetIdx()); + NodePtr memcpy_node = CreateMemcpyAsyncNode(graph, memcpy_name, peer_out_anchor, multi_batch_flag); + GE_CHK_BOOL_EXEC(memcpy_node != nullptr, return FAILED, "Create MemcpyAsync node failed."); + GE_CHK_STATUS(GraphUtils::RemoveEdge(peer_out_anchor, in_data_anchor), "MemcpyAsync node remove edge failed."); + GE_CHK_STATUS(GraphUtils::AddEdge(peer_out_anchor, memcpy_node->GetInDataAnchor(0)), + "MemcpyAsync node add edge failed."); + GE_CHK_STATUS(GraphUtils::AddEdge(memcpy_node->GetOutDataAnchor(0), in_data_anchor), + "MemcpyAsync node add edge failed."); + } + + return SUCCESS; +} + +/// +/// @brief Add MemcpyAsync Node +/// @param [in] graph +/// @param [in] name +/// @param [in] out_data_anchor +/// @param [in] multi_batch_flag +/// @return ge::NodePtr +/// +NodePtr MergeInputMemcpyPass::CreateMemcpyAsyncNode(const ComputeGraphPtr &graph, const std::string &name, + const OutDataAnchorPtr &out_data_anchor, bool multi_batch_flag) { + OpDescPtr pre_op_desc = out_data_anchor->GetOwnerNode()->GetOpDesc(); + GE_CHK_BOOL_EXEC(pre_op_desc != nullptr, return nullptr, "OpDesc of pre node is invalid."); + + const std::string &memcpy_type = multi_batch_flag ? 
MEMCPYADDRASYNC : MEMCPYASYNC; + const std::string &node_name = name + "_" + memcpy_type; + GELOGI("Create MemcpyAsync op:%s.", node_name.c_str()); + OpDescPtr op_desc = MakeShared(node_name, memcpy_type); + if (op_desc == nullptr) { + GELOGE(FAILED, "Create op_desc failed, MemcpyAsync:%s.", node_name.c_str()); + return nullptr; + } + + GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())) == GRAPH_SUCCESS, + return nullptr, "Create MemcpyAsync op: add input desc failed."); + GE_CHK_BOOL_EXEC(op_desc->AddOutputDesc(pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())) == GRAPH_SUCCESS, + return nullptr, "Create MemcpyAsync op: add output desc failed."); + + return graph->AddNode(op_desc); +} +} // namespace ge diff --git a/src/ge/graph/passes/merge_input_memcpy_pass.h b/src/ge/graph/passes/merge_input_memcpy_pass.h new file mode 100644 index 00000000..b8c6f0b8 --- /dev/null +++ b/src/ge/graph/passes/merge_input_memcpy_pass.h @@ -0,0 +1,49 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_GRAPH_PASSES_MERGE_ADD_INPUT_MEMCPY_PASS_H_ +#define GE_GRAPH_PASSES_MERGE_ADD_INPUT_MEMCPY_PASS_H_ + +#include "inc/graph_pass.h" + +namespace ge { +class MergeInputMemcpyPass : public GraphPass { + public: + Status Run(ComputeGraphPtr graph); + + private: + /// + /// @brief Add MemcpyAsync Op as Merge in_node + /// @param [in] graph + /// @param [in] node + /// @param [in] multi_batch_flag + /// @return Status + /// + Status AddMemcpyAsyncNodes(const ComputeGraphPtr &graph, const NodePtr &node, bool multi_batch_flag); + + /// + /// @brief Add MemcpyAsync Node + /// @param [in] graph + /// @param [in] name + /// @param [in] out_data_anchor + /// @param [in] multi_batch_flag + /// @return ge::NodePtr + /// + NodePtr CreateMemcpyAsyncNode(const ComputeGraphPtr &graph, const std::string &name, + const OutDataAnchorPtr &out_data_anchor, bool multi_batch_flag); +}; +} // namespace ge +#endif // GE_GRAPH_PASSES_MERGE_ADD_INPUT_MEMCPY_PASS_H_ diff --git a/src/ge/graph/passes/merge_to_stream_merge_pass.cc b/src/ge/graph/passes/merge_to_stream_merge_pass.cc index 34daa681..8fcc3ed0 100644 --- a/src/ge/graph/passes/merge_to_stream_merge_pass.cc +++ b/src/ge/graph/passes/merge_to_stream_merge_pass.cc @@ -32,7 +32,7 @@ Status MergeToStreamMergePass::Run(ComputeGraphPtr graph) { OpDescPtr merge_op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(merge_op_desc); if (merge_op_desc->HasAttr(ATTR_INSERT_BY_MBATCH)) { - GE_CHK_STATUS_RET(AddMemcpyAsyncNodes(graph, node, true), "Merge add memcpy node failed."); + GE_CHK_STATUS_RET(AddActiveNodes(graph, node), "Merge add active node failed."); GE_CHK_STATUS_RET(SetStreamLabel(node, node->GetName()), "Set stream label failed"); } else { GE_CHK_STATUS_RET(ReplaceMergeNode(graph, node), "Add StreamMerge node failed."); @@ -99,38 +99,26 @@ Status MergeToStreamMergePass::ReplaceMergeNode(const ComputeGraphPtr &graph, co } } - return AddMemcpyAsyncNodes(graph, 
stream_merge, false); + return AddActiveNodes(graph, stream_merge); } /// -/// @brief Add MemcpyAsync Op as StreamMerge in_node +/// @brief Add StreamActive Op before StreamMerge/Merge /// @param [in] graph /// @param [in] node -/// @param [in] multi_batch_flag /// @return Status /// -Status MergeToStreamMergePass::AddMemcpyAsyncNodes(const ComputeGraphPtr &graph, const NodePtr &node, - bool multi_batch_flag) { +Status MergeToStreamMergePass::AddActiveNodes(const ComputeGraphPtr &graph, const NodePtr &node) { GE_CHK_BOOL_EXEC(node != nullptr, return FAILED, "Param of pre node is null."); for (const InDataAnchorPtr &in_data_anchor : node->GetAllInDataAnchors()) { OutDataAnchorPtr peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); NodePtr in_node = peer_out_anchor->GetOwnerNode(); const std::string &type = in_node->GetType(); - // For WhileLoop no need memcpy & active for merge. + // For WhileLoop, no need to add active nodes here, since which have been added in NextIterationPass. GE_IF_BOOL_EXEC((type == ENTER) || (type == REFENTER) || (type == NEXTITERATION) || (type == REFNEXTITERATION), continue); - - const std::string &memcpy_name = node->GetName() + "_input_" + std::to_string(in_data_anchor->GetIdx()); - NodePtr memcpy_node = CreateMemcpyAsyncNode(graph, memcpy_name, peer_out_anchor, multi_batch_flag); - GE_CHK_BOOL_EXEC(memcpy_node != nullptr, return FAILED, "Create MemcpyAsync node failed."); - GE_CHK_STATUS(GraphUtils::RemoveEdge(peer_out_anchor, in_data_anchor), "MemcpyAsync node remove edge failed."); - GE_CHK_STATUS(GraphUtils::AddEdge(peer_out_anchor, memcpy_node->GetInDataAnchor(0)), - "MemcpyAsync node add edge failed."); - GE_CHK_STATUS(GraphUtils::AddEdge(memcpy_node->GetOutDataAnchor(0), in_data_anchor), - "MemcpyAsync node add edge failed."); - - NodePtr active_node = CreateActiveNode(graph, memcpy_node); + NodePtr active_node = CreateActiveNode(graph, in_node); GE_CHK_BOOL_EXEC(active_node != nullptr, return FAILED, "Create StreamActive node failed."); GE_CHK_STATUS(GraphUtils::AddEdge(active_node->GetOutControlAnchor(), node->GetInControlAnchor()), "StreamActive add ctrl edge failed."); @@ -143,37 +131,6 @@ Status MergeToStreamMergePass::AddMemcpyAsyncNodes(const ComputeGraphPtr &graph, return SUCCESS; } -/// -/// @brief Add MemcpyAsync Node -/// @param [in] graph -/// @param [in] name -/// @param [in] out_data_anchor -/// @param [in] multi_batch_flag -/// @return ge::NodePtr -/// -NodePtr MergeToStreamMergePass::CreateMemcpyAsyncNode(const ComputeGraphPtr &graph, const std::string &name, - const OutDataAnchorPtr &out_data_anchor, bool multi_batch_flag) { - GE_CHK_BOOL_EXEC(out_data_anchor != nullptr, return nullptr, "Param of input node is null."); - OpDescPtr pre_op_desc = out_data_anchor->GetOwnerNode()->GetOpDesc(); - GE_CHK_BOOL_EXEC(pre_op_desc != nullptr, return nullptr, "OpDesc of pre node is invalid."); - - const std::string &memcpy_type = multi_batch_flag ? 
MEMCPYADDRASYNC : MEMCPYASYNC; - const std::string &node_name = name + "_" + memcpy_type; - GELOGI("Create MemcpyAsync op:%s.", node_name.c_str()); - OpDescPtr op_desc = MakeShared(node_name, memcpy_type); - if (op_desc == nullptr) { - GELOGE(FAILED, "Create op_desc failed, MemcpyAsync:%s.", node_name.c_str()); - return nullptr; - } - - GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())) == GRAPH_SUCCESS, - return nullptr, "Create MemcpyAsync op: add input desc failed."); - GE_CHK_BOOL_EXEC(op_desc->AddOutputDesc(pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())) == GRAPH_SUCCESS, - return nullptr, "Create MemcpyAsync op: add output desc failed."); - - return graph->AddNode(op_desc); -} - /// /// @brief Create Active Op /// @param [in] graph diff --git a/src/ge/graph/passes/merge_to_stream_merge_pass.h b/src/ge/graph/passes/merge_to_stream_merge_pass.h index 9f713989..25cd67f6 100644 --- a/src/ge/graph/passes/merge_to_stream_merge_pass.h +++ b/src/ge/graph/passes/merge_to_stream_merge_pass.h @@ -34,24 +34,12 @@ class MergeToStreamMergePass : public GraphPass { Status ReplaceMergeNode(const ComputeGraphPtr &graph, const NodePtr &merge_node); /// - /// @brief Add MemcpyAsync Op as StreamMerge in_node + /// @brief Add StreamActive Op as StreamMerge in_node /// @param [in] graph /// @param [in] node - /// @param [in] multi_batch_flag /// @return Status /// - Status AddMemcpyAsyncNodes(const ComputeGraphPtr &graph, const NodePtr &node, bool multi_batch_flag); - - /// - /// @brief Add MemcpyAsync Node - /// @param [in] graph - /// @param [in] name - /// @param [in] out_data_anchor - /// @param [in] multi_batch_flag - /// @return ge::NodePtr - /// - NodePtr CreateMemcpyAsyncNode(const ComputeGraphPtr &graph, const std::string &name, - const OutDataAnchorPtr &out_data_anchor, bool multi_batch_flag); + Status AddActiveNodes(const ComputeGraphPtr &graph, const NodePtr &node); /// /// @brief Create Active Op diff --git a/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc b/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc index 61bca6b8..81aeb831 100644 --- a/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc +++ b/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc @@ -131,6 +131,14 @@ graphStatus TransOpWithoutReshapeFusionPass::GetSubGraphNodesInfo() { sub_graph_has_reshape_node[i] = true; break; } + if (in_node->GetType() == TRANSPOSE || in_node->GetType() == TRANSPOSED) { + auto input_format = in_node->GetOpDesc()->GetInputDescPtr(0)->GetFormat(); + auto output_format = in_node->GetOpDesc()->GetOutputDescPtr(0)->GetFormat(); + if (input_format == output_format) { + sub_graph_has_reshape_node[i] = true; + break; + } + } auto out_anchor = iter->first; GE_CHECK_NOTNULL(out_anchor); diff --git a/src/ge/graph/passes/transpose_transdata_pass.cc b/src/ge/graph/passes/transpose_transdata_pass.cc index b9bd59be..980eac70 100644 --- a/src/ge/graph/passes/transpose_transdata_pass.cc +++ b/src/ge/graph/passes/transpose_transdata_pass.cc @@ -46,6 +46,14 @@ Status TransposeTransDataPass::Run(NodePtr &node) { if (op_desc->GetType() != TRANSPOSED) { return SUCCESS; } + auto input_format = op_desc->GetInputDescPtr(0)->GetFormat(); + auto output_format = op_desc->GetOutputDescPtr(0)->GetFormat(); + if (input_format == output_format) { + GELOGW("Node %s input format is %s, output format is %s, should not happend. 
Ignore pass.", + op_desc->GetName().c_str(), TypeUtils::FormatToSerialString(input_format).c_str(), + TypeUtils::FormatToSerialString(output_format).c_str()); + return SUCCESS; + } if (CheckOneInAndOneOutDataAnchor(node) != SUCCESS) { return FAILED; } diff --git a/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc b/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc index 8a6e0bdc..89c852d5 100644 --- a/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc +++ b/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc @@ -184,6 +184,11 @@ Status AippOp::InsertAippToGraph(ComputeGraphPtr &graph, std::string &aippConfig GE_CHECK_NOTNULL(graph); NodePtr target_input = nullptr; std::vector> target_edges; + + if (this->ConvertRelatedInputNameToRank() != SUCCESS) { + GELOGE(FAILED, "AippOp: convert related input name to rank failed."); + return FAILED; + } GE_CHK_STATUS_RET(this->GetTargetPosition(graph, target_input, target_edges), "Get data nodes position failed"); std::map out_anchors_to_aipp; @@ -412,6 +417,38 @@ Status AippOp::GetStaticTargetNode(const ComputeGraphPtr &graph, NodePtr &data_n return SUCCESS; } +Status AippOp::ConvertRelatedInputNameToRank() { + GE_CHECK_NOTNULL(aipp_params_); + + string related_input_name = aipp_params_->related_input_name(); + if (related_input_name.empty()) { + return SUCCESS; + } + + std::vector data_top_names = domi::GetContext().data_top_names; + GELOGI("Convert name to rank start: data size[%zu]", data_top_names.size()); + uint32_t index = 0; + bool convert_flag = false; + for (const auto &data_top_name : data_top_names) { + if (related_input_name == data_top_name) { + aipp_params_->set_related_input_rank(index); + convert_flag = true; + GELOGI("AippOp: rank: %u, top name: %s.", index, data_top_name.c_str()); + break; + } + index++; + } + if (!convert_flag) { + string error_msg = "Top name " + related_input_name + + "convert rank failed, Please" + " ensure top name in aipp config is the top name of data node."; + ErrorManager::GetInstance().ATCReportErrMessage("E10043", {"reason"}, {error_msg}); + GELOGE(PARAM_INVALID, "Top name[%s] converts rank failed.", related_input_name.c_str()); + return PARAM_INVALID; + } + + return SUCCESS; +} Status AippOp::GetTargetPosition(ComputeGraphPtr graph, NodePtr &target_input, std::vector> &target_edges) { diff --git a/src/ge/graph/preprocess/insert_op/ge_aipp_op.h b/src/ge/graph/preprocess/insert_op/ge_aipp_op.h index 64c89b62..0b09c2ad 100644 --- a/src/ge/graph/preprocess/insert_op/ge_aipp_op.h +++ b/src/ge/graph/preprocess/insert_op/ge_aipp_op.h @@ -79,6 +79,7 @@ class AippOp : public InsertOpBase { Status AddNodeToGraph(const NodePtr &aipp_node, int64_t max_dynamic_aipp_size); Status AddAippAttrbutes(const OpDescPtr &op_desc, const std::string &aipp_cfg_path, const uint32_t &index); Status AddAttrToAippData(const OpDescPtr &aipp_data_op_desc); + Status ConvertRelatedInputNameToRank(); domi::AippOpParams *aipp_params_ = nullptr; ge::NodePtr aipp_node_ = nullptr; diff --git a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc index a2f9c25c..ba0a3e55 100644 --- a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc +++ b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc @@ -115,23 +115,97 @@ void InsertNewOpUtil::ClearNewOps() { } } -Status InsertNewOpUtil::CheckPositionNotRepeat() { +Status InsertNewOpUtil::CheckInputNamePositionNotRepeat() { + for (int i = 0; i < insert_op_conf_->aipp_op_size(); i++) { + const domi::AippOpParams *item = 
insert_op_conf_->mutable_aipp_op(i); + GE_CHECK_NOTNULL(item); + + for (int j = i + 1; j < insert_op_conf_->aipp_op_size(); j++) { + const domi::AippOpParams *another_item = insert_op_conf_->mutable_aipp_op(j); + GE_CHECK_NOTNULL(another_item); + if (another_item->related_input_name().empty()) { + string error_msg = + "Can not both set related_input_name and related_input_rank!" + " Please ensure param is the same with the first aipp config(related_input_name)."; + ErrorManager::GetInstance().ATCReportErrMessage("E10043", {"reason"}, {error_msg}); + GELOGE(PARAM_INVALID, + "Can not both set related_input_rank and related_input_name!" + " Please ensure param is the same with the first aipp config(related_input_name)."); + return PARAM_INVALID; + } + if (item->related_input_name() == another_item->related_input_name()) { + string error_msg = + "Can not insert aipp to the same position! Please ensure related_input_name" + " param is different in different aipp config."; + ErrorManager::GetInstance().ATCReportErrMessage("E10043", {"reason"}, {error_msg}); + GELOGE(PARAM_INVALID, + "Can not insert aipp op to the same position! Please ensure related_input_name param " + "is different in different aipp config."); + return PARAM_INVALID; + } + } + + return SUCCESS; +} + +Status InsertNewOpUtil::CheckInputRankPositionNoRepeat() { for (int i = 0; i < insert_op_conf_->aipp_op_size(); i++) { const domi::AippOpParams *item = insert_op_conf_->mutable_aipp_op(i); + GE_CHECK_NOTNULL(item); for (int j = i + 1; j < insert_op_conf_->aipp_op_size(); j++) { const domi::AippOpParams *another_item = insert_op_conf_->mutable_aipp_op(j); - GE_IF_BOOL_EXEC(item->related_input_rank() == another_item->related_input_rank(), - string errormsg = - "Can not insert aipp to the same postion! Please ensure related_input_rank" - " param is different in different aipp config."; - ErrorManager::GetInstance().ATCReportErrMessage("E10043", {"reason"}, {errormsg}); - GELOGE(PARAM_INVALID, - "Can not insert aipp op to the same postion! Please ensure related_input_rank param " - "is different in different aipp config."); - return PARAM_INVALID;); + GE_CHECK_NOTNULL(another_item); + if (!another_item->related_input_name().empty()) { + string error_msg = + "Can not both set related_input_rank and related_input_name!" + " Please ensure param is the same with the first aipp config(related_input_rank)."; + ErrorManager::GetInstance().ATCReportErrMessage("E10043", {"reason"}, {error_msg}); + GELOGE(PARAM_INVALID, + "Can not both set related_input_rank and related_input_name!" + " Please ensure param is the same with the first aipp config(related_input_rank)."); + return PARAM_INVALID; + } + if (item->related_input_rank() == another_item->related_input_rank()) { + string error_msg = + "Can not insert aipp to the same position! Please ensure related_input_rank" + " param is different in different aipp config."; + ErrorManager::GetInstance().ATCReportErrMessage("E10043", {"reason"}, {error_msg}); + GELOGE(PARAM_INVALID, + "Can not insert aipp op to the same position! 
Please ensure related_input_rank param " + "is different in different aipp config."); + return PARAM_INVALID; + } } } + + return SUCCESS; +} + +Status InsertNewOpUtil::CheckPositionNotRepeat() { + GE_CHECK_NOTNULL(insert_op_conf_); + + if (insert_op_conf_->aipp_op_size() <= 1) { + GELOGI("Aipp op size[%d] less than 2, no need to check position repeat.", insert_op_conf_->aipp_op_size()); + return SUCCESS; + } + + const domi::AippOpParams *item = insert_op_conf_->mutable_aipp_op(0); + GE_CHECK_NOTNULL(item); + + string related_input_name = item->related_input_name(); + Status ret = FAILED; + if (related_input_name.empty()) { + ret = CheckInputRankPositionNoRepeat(); + } else { + ret = CheckInputNamePositionNotRepeat(); + } + if (ret != SUCCESS) { + GELOGE(FAILED, "Check position not repeat failed."); + return FAILED; + } + return SUCCESS; } diff --git a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h index ae431c32..1d166096 100644 --- a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h +++ b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h @@ -51,6 +51,10 @@ class InsertNewOpUtil { Status GetAippParams(const std::unique_ptr &aippParams, const ge::NodePtr &aipp_node); + Status CheckInputNamePositionNotRepeat(); + + Status CheckInputRankPositionNoRepeat(); + Status CheckGraph(const ge::ComputeGraphPtr &graph); InsertNewOpUtil() = default; diff --git a/src/ge/host_cpu_engine/CMakeLists.txt b/src/ge/host_cpu_engine/CMakeLists.txt index a1c19eac..63d219d0 100644 --- a/src/ge/host_cpu_engine/CMakeLists.txt +++ b/src/ge/host_cpu_engine/CMakeLists.txt @@ -28,7 +28,6 @@ target_include_directories(host_cpu_engine PRIVATE ${GE_CODE_DIR}/inc ${GE_CODE_DIR}/inc/external ${GE_CODE_DIR}/inc/framework - ${GE_CODE_DIR}/third_party/fwkacllib/inc ${METADEF_DIR}/inc ${METADEF_DIR}/inc/external ${METADEF_DIR}/inc/external/graph @@ -36,6 +35,8 @@ target_include_directories(host_cpu_engine PRIVATE ${CMAKE_BINARY_DIR}/proto/ge #### yellow zone #### ${GE_CODE_DIR}/../inc + #### blue zone #### + ${GE_CODE_DIR}/third_party/fwkacllib/inc ) target_link_libraries(host_cpu_engine PRIVATE @@ -67,7 +68,6 @@ target_include_directories(atc_host_cpu_engine PRIVATE ${GE_CODE_DIR}/inc ${GE_CODE_DIR}/inc/external ${GE_CODE_DIR}/inc/framework - ${GE_CODE_DIR}/third_party/fwkacllib/inc ${METADEF_DIR}/inc ${METADEF_DIR}/inc/external ${METADEF_DIR}/inc/external/graph @@ -75,6 +75,8 @@ target_include_directories(atc_host_cpu_engine PRIVATE ${CMAKE_BINARY_DIR}/proto/ge #### yellow zone #### ${GE_CODE_DIR}/../inc + #### blue zone #### + ${GE_CODE_DIR}/third_party/fwkacllib/inc ) target_link_libraries(atc_host_cpu_engine PRIVATE @@ -107,7 +109,6 @@ target_include_directories(host_cpu_opskernel_builder PRIVATE ${GE_CODE_DIR}/inc ${GE_CODE_DIR}/inc/external ${GE_CODE_DIR}/inc/framework - ${GE_CODE_DIR}/third_party/fwkacllib/inc ${METADEF_DIR}/inc ${METADEF_DIR}/inc/external ${METADEF_DIR}/inc/external/graph @@ -115,6 +116,8 @@ target_include_directories(host_cpu_opskernel_builder PRIVATE ${CMAKE_BINARY_DIR}/proto/ge #### yellow zone #### ${GE_CODE_DIR}/../inc + #### blue zone #### + ${GE_CODE_DIR}/third_party/fwkacllib/inc ) target_link_libraries(host_cpu_opskernel_builder PRIVATE @@ -141,7 +144,6 @@ target_include_directories(atc_host_cpu_opskernel_builder PRIVATE ${GE_CODE_DIR}/inc ${GE_CODE_DIR}/inc/external ${GE_CODE_DIR}/inc/framework - ${GE_CODE_DIR}/third_party/fwkacllib/inc ${METADEF_DIR}/inc ${METADEF_DIR}/inc/external ${METADEF_DIR}/inc/external/graph 
@@ -149,6 +151,8 @@ target_include_directories(atc_host_cpu_opskernel_builder PRIVATE ${CMAKE_BINARY_DIR}/proto/ge #### yellow zone #### ${GE_CODE_DIR}/../inc + #### blue zone #### + ${GE_CODE_DIR}/third_party/fwkacllib/inc ) target_link_libraries(atc_host_cpu_opskernel_builder PRIVATE @@ -180,7 +184,6 @@ target_include_directories(host_cpu_opskernel_builder_static PRIVATE ${GE_CODE_DIR}/inc ${GE_CODE_DIR}/inc/external ${GE_CODE_DIR}/inc/framework - ${GE_CODE_DIR}/third_party/fwkacllib/inc ${METADEF_DIR}/inc ${METADEF_DIR}/inc/external ${METADEF_DIR}/inc/external/graph @@ -188,6 +191,8 @@ target_include_directories(host_cpu_opskernel_builder_static PRIVATE ${CMAKE_BINARY_DIR}/proto/ge #### yellow zone #### ${GE_CODE_DIR}/../inc + #### blue zone #### + ${GE_CODE_DIR}/third_party/fwkacllib/inc ) target_link_libraries(host_cpu_opskernel_builder_static PRIVATE diff --git a/src/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc b/src/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc index 0cd2daf4..7b3d1971 100644 --- a/src/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc +++ b/src/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc @@ -15,6 +15,7 @@ */ #include "hybrid/node_executor/aicpu/aicpu_node_executor.h" +#include "cce/taskdown_common.hpp" #include "common/formats/formats.h" #include "aicpu/common/aicpu_task_struct.h" #include "graph/load/new_model_manager/model_manager.h" @@ -593,6 +594,15 @@ Status AicpuNodeTask::Init(const HybridModel &model) { auto &args = kernel_def.args(); args_size_ = kernel_def.args_size(); + const std::string &so_name = kernel_def.so_name(); + const OpDescPtr op_desc = MakeShared(*(node_item_->op_desc)); + const auto &context = kernel_def.context(); + auto kernel_type = static_cast(context.kernel_type()); + if (kernel_type == cce::ccKernelType::CUST_AI_CPU) { + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name), "load cust aicpu so failed."); + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "Launch cust aicpu so failed."); + } + GE_CHK_BOOL_RET_STATUS(args.size() == args_size_, FAILED, "Node[%s] task def args.size=%zu, but args_size=%u.", node_name.c_str(), args.size(), args_size_); @@ -676,7 +686,12 @@ Status AicpuNodeTask::LaunchTask(TaskContext &context) { GELOGI("Node[%s] launch task start. 
unknown_type=%d.", node_name_.c_str(), unknown_type_); const auto &so_name = task_def_.kernel().so_name(); const auto &kernel_name = task_def_.kernel().kernel_name(); + const auto &kcontext = task_def_.kernel().context(); + auto kernel_type = static_cast(kcontext.kernel_type()); uint32_t flag = RT_KERNEL_DEFAULT; + if (kernel_type == cce::ccKernelType::CUST_AI_CPU) { + flag |= RT_KERNEL_CUSTOM_AICPU; + } auto rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast(so_name.c_str()), reinterpret_cast(kernel_name.c_str()), 1, // default core dim is 1 diff --git a/src/ge/ir_build/ge_ir_build.cc b/src/ge/ir_build/ge_ir_build.cc index 82344252..b526342d 100644 --- a/src/ge/ir_build/ge_ir_build.cc +++ b/src/ge/ir_build/ge_ir_build.cc @@ -438,6 +438,12 @@ graphStatus aclgrphInferShapeAndType(ge::Graph &graph) { auto compute_graph = GraphUtils::GetComputeGraph(graph); GE_CHECK_NOTNULL(compute_graph); + auto ret = compute_graph->InferOriginFormat(); + if (ret != GRAPH_SUCCESS) { + GELOGE(ret, "Acl InferOriginFormat failed."); + return ret; + } + for (auto &node : compute_graph->GetAllNodes()) { graphStatus ret = ShapeRefiner::InferShapeAndType(node); if (ret == GRAPH_PARAM_INVALID) { diff --git a/src/ge/opskernel_manager/optimizer_priority.pbtxt b/src/ge/opskernel_manager/optimizer_priority.pbtxt index 1c9522c9..a53184a1 100755 --- a/src/ge/opskernel_manager/optimizer_priority.pbtxt +++ b/src/ge/opskernel_manager/optimizer_priority.pbtxt @@ -1 +1 @@ -optimizer:["aicpu_tf_optimizer","AIcoreEngine","VectorEngine","aicpu_ascend_optimizer","hccl_graph_optimizer", "hvd_graph_optimizer", "DNN_VM_RTS_GRAPH_OPTIMIZER_STORE"] +optimizer:["aicpu_tf_optimizer","aicpu_ascend_optimizer","AIcoreEngine","VectorEngine","hccl_graph_optimizer", "hvd_graph_optimizer", "DNN_VM_RTS_GRAPH_OPTIMIZER_STORE"] diff --git a/src/ge/session/omg.cc b/src/ge/session/omg.cc index 2d9867e7..8fe9bbb5 100644 --- a/src/ge/session/omg.cc +++ b/src/ge/session/omg.cc @@ -995,8 +995,10 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertFwkModelToJson(const domi::FrameworkType ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, - {"--framework", std::to_string(framework), "only support 0(Caffe) 3(TensorFlow)"}); - GELOGE(PARAM_INVALID, "Input parameter[--framework] is mandatory and it's value must be: 0(Caffe) 3(TensorFlow)."); + {"--framework", std::to_string(framework), "only support 0(Caffe) 3(TensorFlow) 5(Onnx)"}); + GELOGE(PARAM_INVALID, + "Input parameter[--framework] is mandatory and it's value must be: 0(Caffe) 3(TensorFlow) " + "or 5(Onnx)."); return PARAM_INVALID; } @@ -1039,6 +1041,7 @@ void UpdateOmgCtxWithParserCtx() { domi::GetContext().out_top_names = GetParserContext().out_top_names; domi::GetContext().user_out_nodes_top_vec = GetParserContext().user_out_nodes_top_vec; domi::GetContext().default_out_nodes = GetParserContext().default_out_nodes; + domi::GetContext().data_top_names = GetParserContext().data_top_names; } void UpdateParserCtxWithOmgCtx() { @@ -1055,5 +1058,6 @@ void UpdateParserCtxWithOmgCtx() { GetParserContext().input_nodes_format_map = domi::GetContext().input_nodes_format_map; GetParserContext().out_top_names = domi::GetContext().out_top_names; GetParserContext().user_out_nodes_top_vec = domi::GetContext().user_out_nodes_top_vec; + GetParserContext().data_top_names = domi::GetContext().data_top_names; } } // namespace ge diff --git a/src/ge/single_op/single_op_model.cc b/src/ge/single_op/single_op_model.cc index fb676008..9c6ad8d9 100644 --- a/src/ge/single_op/single_op_model.cc 
+++ b/src/ge/single_op/single_op_model.cc @@ -31,6 +31,7 @@ #include "task/aicpu_task_builder.h" #include "task/aicpu_kernel_task_builder.h" #include "task/tbe_task_builder.h" +#include "graph/load/new_model_manager/model_manager.h" static std::atomic aicpu_sessionid(0); @@ -187,6 +188,7 @@ Status SingleOpModel::LoadAllNodes() { } ge_model->GetTBEKernelStore().LoadTBEKernelBinToOpDesc(op_desc); + ge_model->GetCustAICPUKernelStore().LoadCustAICPUKernelBinToOpDesc(op_desc); } return SUCCESS; @@ -244,7 +246,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size()); ParseArgTable(tbe_task, single_op); single_op.tasks_.emplace_back(tbe_task); - } else if (kernel_type == cce::ccKernelType::AI_CPU) { + } else if (kernel_type == cce::ccKernelType::AI_CPU || kernel_type == cce::ccKernelType::CUST_AI_CPU) { GELOGD("Building AICPU_CC task"); OpTask *task = nullptr; auto ret = BuildCpuKernelTask(task_def.kernel(), &task); @@ -253,7 +255,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { } single_op.tasks_.emplace_back(task); } else { - GELOGE(UNSUPPORTED, "Only TBE kernel and AI_CPU kernel are supported, but got %u", context.kernel_type()); + GELOGE(UNSUPPORTED, "Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u", context.kernel_type()); return UNSUPPORTED; } } else if (task_type == RT_MODEL_TASK_KERNEL_EX) { @@ -273,6 +275,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { GELOGD("Skip task type: %d", static_cast(task_type)); } } + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "launch cust aicpu so failed."); return SUCCESS; } @@ -388,13 +391,13 @@ Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingl TbeOpTask *tbe_task = nullptr; GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def.kernel(), &tbe_task)); single_op.op_task_.reset(tbe_task); - } else if (kernel_type == cce::ccKernelType::AI_CPU) { + } else if (kernel_type == cce::ccKernelType::AI_CPU || kernel_type == cce::ccKernelType::CUST_AI_CPU) { GELOGD("Building AICPU_CC task"); OpTask *task = nullptr; GE_CHK_STATUS_RET_NOLOG(BuildCpuKernelTask(task_def.kernel(), &task)); single_op.op_task_.reset(task); } else { - GELOGE(UNSUPPORTED, "Only TBE kernel and AI_CPU kernel are supported, but got %u", context.kernel_type()); + GELOGE(UNSUPPORTED, "Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u", context.kernel_type()); return UNSUPPORTED; } return SUCCESS; @@ -444,6 +447,7 @@ Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) { GELOGD("Skip task type: %d", static_cast(task_type)); } } + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "launch cust aicpu so failed."); return SUCCESS; } diff --git a/src/ge/single_op/task/aicpu_kernel_task_builder.cc b/src/ge/single_op/task/aicpu_kernel_task_builder.cc index cc334f41..eb9d4d5c 100644 --- a/src/ge/single_op/task/aicpu_kernel_task_builder.cc +++ b/src/ge/single_op/task/aicpu_kernel_task_builder.cc @@ -15,6 +15,8 @@ */ #include "single_op/task/aicpu_kernel_task_builder.h" +#include "cce/taskdown_common.hpp" +#include "graph/load/new_model_manager/model_manager.h" namespace ge { AiCpuCCTaskBuilder::AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def) @@ -55,6 +57,14 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task) { task.SetkernelName(kernel_name); task.op_desc_ = op_desc_; + const auto &context = kernel_def_.context(); + auto 
kernel_type = static_cast(context.kernel_type()); + if (kernel_type == cce::ccKernelType::CUST_AI_CPU) { + task.is_custom_ = true; + task.dump_flag_ |= RT_KERNEL_CUSTOM_AICPU; + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc_, so_name), "launch cust aicpu so failed"); + } + task.num_inputs_ = op_desc_->GetInputsSize(); task.num_outputs_ = op_desc_->GetOutputsSize(); diff --git a/src/ge/single_op/task/build_task_utils.cc b/src/ge/single_op/task/build_task_utils.cc index 268cbfd1..7a2e889a 100644 --- a/src/ge/single_op/task/build_task_utils.cc +++ b/src/ge/single_op/task/build_task_utils.cc @@ -45,6 +45,7 @@ std::vector> BuildTaskUtils::GetAddresses(const OpDescPtr &o runtime_para.logic_var_base = kLogicVarBase; runtime_para.var_base = kVarBase; runtime_para.session_id = kSessionId; + runtime_para.is_single_op = true; ret.emplace_back(ModelUtils::GetInputDataAddrs(runtime_para, op_desc)); ret.emplace_back(ModelUtils::GetOutputDataAddrs(runtime_para, op_desc)); diff --git a/src/ge/single_op/task/op_task.cc b/src/ge/single_op/task/op_task.cc index 78db835e..7be65ec2 100644 --- a/src/ge/single_op/task/op_task.cc +++ b/src/ge/single_op/task/op_task.cc @@ -260,8 +260,8 @@ Status AiCpuBaseTask::SetExtInfoAndType(const std::string &kernel_ext_info) { return SUCCESS; } -Status AiCpuBaseTask::UpdateExtInfo(const std::vector &input_desc, - std::vector &output_desc) { +Status AiCpuBaseTask::UpdateExtInfo(const std::vector &input_desc, std::vector &output_desc, + rtStream_t stream) { GELOGI("Update ext info begin, unknown_type=%d.", unknown_type_); if (num_inputs_ == 0 && num_outputs_ == 0) { GELOGI("No input and output, no need update ext info."); @@ -278,15 +278,13 @@ Status AiCpuBaseTask::UpdateExtInfo(const std::vector &input_desc, for (size_t j = 0; j < num_outputs_; ++j) { GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateOutputShapeAndType(j, output_desc[j]), "Output[%zu] UpdateOutputShapeAndType failed.", j); - // debug code - GELOGD("No input and output, no need update ext info."); } } - GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_, - aicpu_ext_handle_->GetExtInfoLen(), // check size - aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(), - RT_MEMCPY_HOST_TO_DEVICE)); + GE_CHK_RT_RET(rtMemcpyAsync(ext_info_addr_dev_, + aicpu_ext_handle_->GetExtInfoLen(), // check size + aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(), + RT_MEMCPY_HOST_TO_DEVICE_EX, stream)); GELOGI("Update ext info end."); return SUCCESS; @@ -599,7 +597,7 @@ Status AiCpuTask::SetMemCopyTask(const domi::KernelExDef &kernel_def) { Status AiCpuTask::LaunchKernel(const std::vector &input_desc, const std::vector &input_buffers, std::vector &output_desc, std::vector &output_buffers, rtStream_t stream) { - GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc)); + GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream)); std::vector inputs; std::vector outputs; for (auto &buffer : input_buffers) { @@ -610,11 +608,12 @@ Status AiCpuTask::LaunchKernel(const std::vector &input_desc, } GE_CHK_STATUS_RET_NOLOG(SetIO(inputs, outputs)); GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream)); - GE_CHK_RT_RET(rtStreamSynchronize(stream)); if (unknown_type_ == DEPEND_SHAPE_RANGE) { + GE_CHK_RT_RET(rtStreamSynchronize(stream)); GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc)); } else if (unknown_type_ == DEPEND_COMPUTE) { + GE_CHK_RT_RET(rtStreamSynchronize(stream)); GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, output_buffers, stream)); } @@ -647,9 
+646,9 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) { kernel_name_.data()); // sm_desc is nullptr, because l2 buffer does not support auto *sm_desc = reinterpret_cast(sm_desc_); - auto ret = - rtCpuKernelLaunch(static_cast(so_name_.data()), static_cast(kernel_name_.data()), - block_dim_, args_.get(), static_cast(arg_size_), sm_desc, stream); + auto ret = rtCpuKernelLaunchWithFlag(static_cast(so_name_.data()), + static_cast(kernel_name_.data()), block_dim_, args_.get(), + static_cast(arg_size_), sm_desc, stream, dump_flag_); if (ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Invoke rtCpuKernelLaunch failed. ret = %d", ret); return RT_FAILED; @@ -665,7 +664,7 @@ Status AiCpuCCTask::LaunchKernel(const std::vector &input_desc, GE_CHK_BOOL_RET_STATUS(unknown_type_ != DEPEND_COMPUTE, FAILED, "AiCpuCCTask unknown type[%d] is depend compute, it's not supported now.", unknown_type_); - GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc)); + GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream)); size_t arg_index = 0; auto *task_io_addr = reinterpret_cast(io_addr_); @@ -678,9 +677,9 @@ Status AiCpuCCTask::LaunchKernel(const std::vector &input_desc, } GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream)); - GE_CHK_RT_RET(rtStreamSynchronize(stream)); if (unknown_type_ == DEPEND_SHAPE_RANGE) { + GE_CHK_RT_RET(rtStreamSynchronize(stream)); GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc)); } diff --git a/src/ge/single_op/task/op_task.h b/src/ge/single_op/task/op_task.h index 5f742197..0c65f8a3 100644 --- a/src/ge/single_op/task/op_task.h +++ b/src/ge/single_op/task/op_task.h @@ -118,7 +118,8 @@ class AiCpuBaseTask : public OpTask { protected: Status SetExtInfoAndType(const std::string &kernel_ext_info); - Status UpdateExtInfo(const std::vector &input_desc, std::vector &output_desc); + Status UpdateExtInfo(const std::vector &input_desc, std::vector &output_desc, + rtStream_t stream); Status UpdateOutputShape(vector &output_desc); Status UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc); @@ -214,6 +215,8 @@ class AiCpuCCTask : public AiCpuBaseTask { uint32_t block_dim_ = 1; void *sm_desc_ = nullptr; void *io_addr_ = nullptr; + bool is_custom_ = false; + uint32_t dump_flag_ = RT_KERNEL_DEFAULT; }; } // namespace ge diff --git a/src/proto/insert_op.proto b/src/proto/insert_op.proto index a059e122..47646163 100644 --- a/src/proto/insert_op.proto +++ b/src/proto/insert_op.proto @@ -61,6 +61,9 @@ message AippOpParams { // ±êʶ¶ÔÄ£Ð͵ĵڼ¸¸öÊäÈë×öAIPP´¦Àí£¬ÀýÈçÄ£ÐÍÓÐÁ½¸öÊäÈ룬ÐèÒª¶ÔµÚ2¸öÊäÈë×öAIPP£¬ÔòÅäÖÃrelated_input_rankΪ1¡£ uint32 related_input_rank = 2; + // related_input_name is optional and the top name of data node which inserts aipp + string related_input_name = 6; + // input_edge_idx²ÎÊýΪ¿ÉÑ¡£¬ÀàÐÍΪÕûÐÍ£¬ÅäÖ÷¶Î§Îª>=0¡£ // ÅäÖøòÎÊýµÄ×÷Óã¬ÔÚÓÚ¶ÔDataËã×Ó²»Í¬µÄÊä³ö×ö²»Í¬µÄAIPP´¦Àí£¬Èç¹û¸Ã²ÎÊýûÓÐÅäÖã¬Ä¬È϶Ôrelated_input_rankÖ¸¶¨µÄÄ£ÐÍÊäÈëµÄËùÓÐÊä³ö±ß×öAIPP¡£ // ÅäÖÃÖµ <= DataËã×ÓÊä³ö±ßµÄ¸öÊý¡£ diff --git a/third_party/fwkacllib/inc/hccl/base.h b/third_party/fwkacllib/inc/hccl/base.h index f543fa4d..8194097e 100644 --- a/third_party/fwkacllib/inc/hccl/base.h +++ b/third_party/fwkacllib/inc/hccl/base.h @@ -68,8 +68,10 @@ struct MemRegisterAddr { u64 addr; u64 length; }; - -const u32 HCCL_MAX_MEM_REGISTER_NUM = 1024 * 1024; // The max number of memory register address is 1M (1024 * 1024). +/* + * @brief The max number of memory register address for remote access. 
+ */ +const u32 HCCL_MAX_MEM_REGISTER_NUM = 32; enum GradSplitForceMode { FORCE_NONE, /**< no force */ diff --git a/third_party/fwkacllib/inc/ops/data_flow_ops.h b/third_party/fwkacllib/inc/ops/data_flow_ops.h index 461b3617..bb937a75 100644 --- a/third_party/fwkacllib/inc/ops/data_flow_ops.h +++ b/third_party/fwkacllib/inc/ops/data_flow_ops.h @@ -2240,6 +2240,64 @@ REG_OP(OutfeedEnqueueOp) .ATTR(channel_name, String, "") .OP_END_FACTORY_REG(OutfeedEnqueueOp) +/** +*@brief LruCache, creates a cache resource. +*@par Inputs: +*No input. +*@par Attributes: +*cache_size: An optional "int64". The capacity of the cache. Defaults to "100000". +*load_factor: An optional "float". The occupancy ratio at which the cache is considered full. Defaults to "1". +*@par Outputs: +*cache: the created cache resource. +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ +REG_OP(LruCache) + .OUTPUT(cache, TensorType({DT_RESOURCE})) + .ATTR(container, String, "") + .ATTR(shared_name, String, "LruCache") + .ATTR(cache_size, Int, 100000) + .ATTR(load_factor, Float, 1) + .OP_END_FACTORY_REG(LruCache) + +/** +*@brief CacheAdd, inserts ids into the cache and returns the ids swapped in and swapped out. +*@par Inputs: +*cache: the cache resource. +*ids: a Tensor of ids to insert into the cache. +*@par Outputs: +*swap_in_id: ids swapped into the cache. +*swap_in_idx: cache indices of the ids swapped in. +*swap_out_id: ids swapped out of the cache. +*swap_out_idx: cache indices of the ids swapped out. +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ +REG_OP(CacheAdd) + .INPUT(cache, TensorType({DT_RESOURCE})) + .INPUT(ids, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32})) + .OUTPUT(swap_in_id, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32})) + .OUTPUT(swap_in_idx, TensorType({DT_INT64})) + .OUTPUT(swap_out_id, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32})) + .OUTPUT(swap_out_idx, TensorType({DT_INT64})) + .OP_END_FACTORY_REG(CacheAdd) + +/** +*@brief CacheRemoteIndexToLocal, maps ids to their local indices in the cache. +*@par Inputs: +*cache: the cache resource. +*ids: a Tensor of ids to look up in the cache. +*@par Outputs: +*local_idx: local cache indices of the ids. +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ +REG_OP(CacheRemoteIndexToLocal) + .INPUT(cache, TensorType({DT_RESOURCE})) + .INPUT(ids, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32})) + .OUTPUT(local_idx, TensorType({DT_INT64})) + .OP_END_FACTORY_REG(CacheRemoteIndexToLocal) + } // namespace ge #endif // OPS_BUILT_IN_OP_PROTO_INC_DATA_FLOW_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h index 536dea63..c64bc138 100644 --- a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h @@ -2802,6 +2802,80 @@ REG_OP(AdamApplyOneAssign) .OUTPUT(output2, TensorType({DT_FLOAT16,DT_FLOAT})) .OP_END_FACTORY_REG(AdamApplyOneAssign) +/** +*@brief A fusion operator for bert lamb. \n + +*@par Inputs: +*Ten inputs, including: +* @li input0: A Tensor. Must be one of the following types: float16, float32. +* @li input1: A Tensor. Must be one of the following types: float16, float32. +* @li input2: A Tensor. Must be one of the following types: float16, float32. +* @li input3: A Tensor. Must be one of the following types: float16, float32. +* @li input4: A Tensor. Must be one of the following types: float16, float32. +* @li mul0_x: A Tensor. Must be one of the following types: float16, float32. +* @li mul1_x: A Tensor. 
Must be one of the following types: float16, float32. +* @li mul2_x: A Tensor. Must be one of the following types: float16, float32. +* @li mul3_x: A Tensor. Must be one of the following types: float16, float32. +* @li steps: A Tensor. Must be one of the following types: float16, float32. +* @li do_use_weight: A Tensor. Must be one of the following types: float16, float32. +* @li weight_decay_rate: A Tensor. Must be one of the following types: float16, float32. +* @li add2_y: A Tensor. Must be one of the following types: float16, float32. \n + +*@par Outputs: +*Three outputs, including: +* @li output0: A Tensor. Must be one of the following types: float16, float32. \n + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ +REG_OP(LambApplyOptimizerAssign) + .INPUT(input0, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input3, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul0_x, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul1_x, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul2_x, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul3_x, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(add2_y, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(steps, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(do_use_weight, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(weight_decay_rate, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(output0, TensorType({DT_FLOAT16,DT_FLOAT})) + .OP_END_FACTORY_REG(LambApplyOptimizerAssign) + +/** +*@brief A fusion operator for bert lamb. \n + +*@par Inputs: +*Ten inputs, including: +* @li input0: A Tensor. Must be one of the following types: float16, float32. +* @li input1: A Tensor. Must be one of the following types: float16, float32. +* @li input2: A Tensor. Must be one of the following types: float16, float32. +* @li input3: A Tensor. Must be one of the following types: float16, float32. +* @li input4: A Tensor. Must be one of the following types: float16, float32. +* @li mul0_x: A Tensor. Must be one of the following types: float16, float32. +* @li mul1_x: A Tensor. Must be one of the following types: float16, float32. +* @li mul2_x: A Tensor. Must be one of the following types: float16, float32. +* @li mul3_x: A Tensor. Must be one of the following types: float16, float32. +* @li steps: A Tensor. Must be one of the following types: float16, float32. +* @li do_use_weight: A Tensor. Must be one of the following types: float16, float32. +* @li weight_decay_rate: A Tensor. Must be one of the following types: float16, float32. +* @li add2_y: A Tensor. Must be one of the following types: float16, float32. \n + +*@par Outputs: +*No outputs +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. +*/ +REG_OP(LambApplyWeightAssign) + .INPUT(input0, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input3, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input4, TensorType({DT_FLOAT16,DT_FLOAT})) + .OP_END_FACTORY_REG(LambApplyWeightAssign) + /** *@brief Confuse select, maximum, greater and sqrt. 
\n diff --git a/third_party/fwkacllib/inc/ops/math_ops.h b/third_party/fwkacllib/inc/ops/math_ops.h index ff51bb2f..149e0e37 100644 --- a/third_party/fwkacllib/inc/ops/math_ops.h +++ b/third_party/fwkacllib/inc/ops/math_ops.h @@ -495,51 +495,51 @@ REG_OP(NextAfter) .OP_END_FACTORY_REG(NextAfter) /** - * *@brief Compute element-wise finiteness, return a boolean tensor. - * - * *@par Inputs: - * *x:A Tensor. - * - * *@par Outputs: - * *y:A Tensor. Has the same shape as x. - * - * *@par Third-party framework compatibility. - * *Compatible with tensorflow IsFinite operator. - * */ + *@brief Compute element-wise finiteness, return a boolean tensor. + + *@par Inputs: + *x:A Tensor. + + *@par Outputs: + *y:A Tensor. Has the same shape as x. + + *@par Third-party framework compatibility. + *Compatible with tensorflow IsFinite operator. + */ REG_OP(IsFinite) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .OUTPUT(y, TensorType({DT_BOOL})) .OP_END_FACTORY_REG(IsFinite) /** - * *@brief Compute element-wise infiniteness, return a boolean tensor. - * - * *@par Inputs: - * *x:A Tensor. - * - * *@par Outputs: - * *y:A Tensor. Has the same shape as x. - * - * *@par Third-party framework compatibility. - * *Compatible with tensorflow IsInf operator. - * */ + *@brief Compute element-wise infiniteness, return a boolean tensor. + + *@par Inputs: + *x:A Tensor. + + *@par Outputs: + *y:A Tensor. Has the same shape as x. + + *@par Third-party framework compatibility. + *Compatible with tensorflow IsInf operator. + */ REG_OP(IsInf) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .OUTPUT(y, TensorType({DT_BOOL})) .OP_END_FACTORY_REG(IsInf) /** - * *@brief Computes the complex absolute value of a tensor. - * - * *@par Inputs: - * *x:A Tensor. - * - * *@par Outputs: - * *y:A tensor of type `float` or `double` that is the absolute value of each element in `x`. - * - * *@par Third-party framework compatibility. - * *Compatible with tensorflow ComplexAbs operator. - * */ + *@brief Computes the complex absolute value of a tensor. + + *@par Inputs: + *x:A Tensor. + + *@par Outputs: + *y:A tensor of type `float` or `double` that is the absolute value of each element in `x`. + + *@par Third-party framework compatibility. + *Compatible with tensorflow ComplexAbs operator. + */ REG_OP(ComplexAbs) .INPUT(x, TensorType({DT_COMPLEX64, DT_COMPLEX128})) .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE})) @@ -547,34 +547,34 @@ REG_OP(ComplexAbs) .OP_END_FACTORY_REG(ComplexAbs) /** - * *@brief Returns which elements of x are NaN. - * - * *@par Inputs: - * *x:A Tensor. - * - * *@par Outputs: - * *y:A Tensor. Has the same shape as x. - * - * *@par Third-party framework compatibility. - * *Compatible with tensorflow IsNan operator. - * */ + *@brief Returns which elements of x are NaN. + + *@par Inputs: + *x:A Tensor. + + *@par Outputs: + *y:A Tensor. Has the same shape as x. + + *@par Third-party framework compatibility. + *Compatible with tensorflow IsNan operator. + */ REG_OP(IsNan) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .OUTPUT(y, TensorType({DT_BOOL})) .OP_END_FACTORY_REG(IsNan) /** - * *@brief Returns the real part of a complex number. - * - * *@par Inputs: - * *input:A Tensor. - * - * *@par Outputs: - * *output:A Tensor. Has the same shape as input. - * - * *@par Third-party framework compatibility. - * *Compatible with tensorflow Real operator. - * */ + *@brief Returns the real part of a complex number. + + *@par Inputs: + *input:A Tensor. + + *@par Outputs: + *output:A Tensor. 
Has the same shape as input. + + *@par Third-party framework compatibility. + *Compatible with tensorflow Real operator. + */ REG_OP(Real) .INPUT(input, TensorType({DT_COMPLEX64, DT_COMPLEX128})) .OUTPUT(output, TensorType({DT_FLOAT, DT_DOUBLE})) @@ -582,17 +582,17 @@ REG_OP(Real) .OP_END_FACTORY_REG(Real) /** - * *@brief Returns the complex conjugate of a complex number. - * - * *@par Inputs: - * *input:A Tensor. - * - * *@par Outputs: - * *output:A Tensor. Has the same shape as input. - * - * *@par Third-party framework compatibility. - * *Compatible with tensorflow output operator. - * */ + *@brief Returns the complex conjugate of a complex number. + + *@par Inputs: + *input:A Tensor. + + *@par Outputs: + *output:A Tensor. Has the same shape as input. + + *@par Third-party framework compatibility. + *Compatible with tensorflow output operator. + */ REG_OP(Conj) .INPUT(input, TensorType({DT_COMPLEX64, DT_COMPLEX128})) .OUTPUT(output, TensorType({DT_COMPLEX64, DT_COMPLEX128})) @@ -698,15 +698,14 @@ REG_OP(IFMR) *@par Inputs: *@li w:A Tensor of weights. \n +*@li w_min:A Tensor of weights reduce_min. \n +*@li w_max:A Tensor of weights reduce_max. \n *@par Attributes: -*axes: specify channel. *num_bits: the bits num used for quantize. *offset_flag: whether using offset. \n *@par Outputs: -*scale: quantization factor scale. -*offset: quantization factor offset. *y: fake quantized weights. \n *@par Third-party framework compatibility @@ -715,10 +714,9 @@ REG_OP(IFMR) REG_OP(WtsARQ) .INPUT(w, TensorType({DT_FLOAT16, DT_FLOAT})) - .OUTPUT(scale, TensorType({DT_FLOAT16, DT_FLOAT})) - .OUTPUT(offset, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(w_min, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(w_max, TensorType({DT_FLOAT16, DT_FLOAT})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) - .ATTR(axes, ListInt, {0}) .ATTR(num_bits, Int, 8) .ATTR(offset_flag, Bool, false) .OP_END_FACTORY_REG(WtsARQ) diff --git a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h index c5a6a523..c0576dd5 100644 --- a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h @@ -582,103 +582,105 @@ REG_OP(Conv2DBackpropFilterD) /** *@brief Computes a 2D convolution given 4D "x" and "filter" tensors. *@par Inputs: -*@li x: A 4D tensor of input images. With "NHWC" format, the shape is -* [batch, in_height, in_width, in_channels]. -*@li filter: A 4D tensor of filters. Has the same type as "x". With "HWCN" -* format, the shape is [filter_height, filter_width, in_channels, -* out_channels]. - -*@li bias: An optional 1D tensor. Shape is [out_channels]. -*@li offset_w: An optional 1D tensor for quantized convolution. Shape is -* [out_channels]. Not supported. +*@li x: A 4D tensor of input image. With the format "NHWC", the data is stored +* in the order of: [batch, in_height, in_width, in_channels]. +*@li filter: A 4D tensor of learnable filters. Must have the same type as "x". +* With the format "HWCN" , the data is stored in the order of: [filter_height, +* filter_width, in_channels / groups, out_channels]. +*@li bias: An optional 1D tensor of additive biases to the filter outputs. +* The data is stored in the order of: [out_channels]. +*@li offset_w: Reserved. 
*\n *\n -* Note that there is a strict data type mapping between the input and output -* tensors: +* The following are the supported data types and data formats: *@verbatim - |Tensor | x | filter | bias | offset_w | y - -----------|---------|---------|---------|----------|-------- - |Data Type | float16 | float16 | float16 | _ | float16 - | |---------|---------|---------|----------|-------- - | | float32 | float32 | float32 | _ | float32 - | |---------|---------|---------|----------|-------- - | | int8 | int8 | int32 | int8 | int32 - -----------|---------|---------|---------|----------|-------- - |Format | NCHW | NCHW | ND | ND | NCHW - | | NHWC | HWCN | | | NHWC + | Tensor | x | filter | bias | y + ------------|---------|---------|---------|-------- + | Data Type | float16 | float16 | float16 | float16 + | |---------|---------|---------|-------- + | | float32 | float32 | float32 | float32 + | |---------|---------|---------|-------- + | | int8 | int8 | int32 | int32 + ------------|---------|---------|---------|-------- + | Format | NCHW | NCHW | ND | NCHW + | | NHWC | HWCN | | NHWC @endverbatim -* Type float32 is allowed only in mixed precision (float32->float16) scenarios. -* Mixed precision is enabled by default. -* \n +* For float32 type, the actual calculation on the chip is based on +* float16. For int8, a dequant or requant operator must be followed. +*\n * *@par Attributes: -*@li strides: Required. A list of 4 integers. Specifying the strides of the -* convolution along the height and width. The dimension order is determined -* by the data format of "x". By default the N and C dimensions are set to 1. -*@li pads: Required. A list of 4 integers. Specifying the top, bottom, left -* and right padding. -* @li dilations: Optional. A list of 4 integers. Specifying the dilation rate -* to use for dilated convolution. Has the same dimension order and value as -* "strides". Dilation > 1 is not supported for quantized convolution. Defaults -* to [1, 1, 1, 1]. -* @li groups: Optional. An integer of type int32, for the number of blocked -* connections from input channels to output channels. Input channels and output -* channels must both be divisible by "groups". "x" in_channels must be equal to -* "filter" in_channels * groups. Defaults to 1. -* @li offset_x: Optional. An integer of type int32, for quantized convolution. -* Defaults to 0. -* @li data_format: Reserved and optional. A string from: "NHWC" and "NCHW". -* Specifying the data format of the input and output images. Defaults to -* "NHWC". +*@li strides: Required. A list of 4 integers. The stride of the sliding window +* for each dimension of input. The dimension order is determined by the data +* format of "x". The N and C dimensions must be set to 1. +*@li pads: Required. A list of 4 integers. The number of pixels to add to each +* (top, bottom, left, right) side of the input. +*@li dilations: Optional. A list of 4 integers. The dilation factor for each +* dimension of input. The dimension order is determined by the data format of +* "x". The N and C dimensions must be set to 1. The H and W dimensions must be +* set to 1 for int8 type. Defaults to [1, 1, 1, 1]. +*@li groups: Optional. An integer of type int32. The number of blocked +* connections from input channels to output channels. In_channels and +* out_channels must both be divisible by "groups". Defaults to 1. +*@li offset_x: Optional. An integer of type int32. The negative offset added +* to the input image for int8 type. 
Ensure that the output is within the +* effective range. Defaults to 0. +*@li data_format: Reserved. *\n *\n * The following value range restrictions must be met: *@verbatim - |Name | Field | Scope - ------------------|----------|---------- - |Input Image Size | H | [1, 100000] - | | W | [1, 4096] - ------------------|----------|---------- - |Filter Size | H | [1, 255] - | | W | [1, 255] - ------------------|----------|---------- - |Stride | H | [1, 63] - | | W | [1, 63] - ------------------|----------|---------- - |Padding | top | [0, 255] - | | bottom | [0, 255] - | | left | [0, 255] - | | right | [0, 255] - ------------------|----------|---------- - |Dilation | H | [1, 255] - | | W | [1, 255] + | Name | Field | Scope + -------------------|----------|-------------- + | Input Image Size | H | [1, 100000] + | | W | [1, 4096] + -------------------|----------|-------------- + | Filter Size | H | [1, 255] + | | W | [1, 255] + -------------------|----------|-------------- + | Stride | H | [1, 63] + | | W | [1, 63] + -------------------|----------|-------------- + | Padding | Top | [0, 255] + | | Bottom | [0, 255] + | | Left | [0, 255] + | | Right | [0, 255] + -------------------|----------|-------------- + | Dilation | H | [1, 255] + | | W | [1, 255] + -------------------|----------|-------------- + | Offset_x | | [-128, 127] + @endverbatim +*\n * *@par Outputs: -*@li y: A 4D Tensor of output images. Has the same type and format as "x". With -* "NHWC" format, the shape is [batch, out_height, out_width, out_channels]. +*@li y: A 4D Tensor of output feature map. Has the same type as "x". With the +* format "NHWC", the data is stored in the order of: [batch, out_height, +* out_width, out_channels]. *\n -* out_height = (in_height + top_pad + bottom_pad - -* dilation_h * (filter_height - 1) - 1) +* out_height = (in_height + pad_top + pad_bottom - +* (dilation_h * (filter_height - 1) + 1)) * / stride_h + 1 *\n -* out_width = (in_width + left_pad + right_pad - -* dilation_w * (filter_width - 1) - 1) -* / stride_w + 1 +* out_width = (in_width + pad_left + pad_right - +* (dilation_w * (filter_width - 1) + 1)) +* / stride_w + 1 * *@attention Constraints: *@li The following restrictions on the output must be met: *@verbatim - | Output | Restrictions - -------------------|--------------------------- - | W dimension == 1 | H*W(input) == H*W(filter) - | H dimension == 1 | - -------------------|--------------------------- - | W dimension == 1 | Not supported - | H dimension != 1 | + | Output | Restrictions + ----------|-------------------------------- + | H == 1 | H * W(input) == H * W(filter) + | W == 1 | + ----------|-------------------------------- + | H != 1 | W(input) == W(filter) + | W == 1 | Only for Ascend310 Hi3796V300CS @endverbatim * "H * W (input)" indicates the image size after padding and "H * W (filter)" -* indicates the filter size after dilation. +* indicates the filter size after dilation."W(input)" and W(filter) indicate +* the same rule on the W dimension. *\n * *@par Quantization supported or not @@ -767,106 +769,112 @@ REG_OP(Conv2DCompress) .OP_END_FACTORY_REG(Conv2DCompress) /** -*@brief Computes a 2D convolution given 4D "x", "filter" and "offsets" -* tensors. +*@brief Computes a 2D deformable convolution given 4D "x", "filter" and +* "offsets" tensors. *@par Inputs: -* @li x: A 4D tensor of input images. With shape of -* [batch, in_height, in_width, in_channels] when format is "NHWC". -* @li filter: A 4D tensor of filters. Must have the same type as "x". 
With -* shape of [filter_height, filter_width, in_channels, out_channels] when format -* is "HWCN". -* @li offsets: A 4D tensor of offsets. With shape of -* [batch, deformable_groups * filter_height * filter_width * 3, in_height, -* in_width] when format is "NCHW". -* @li bias: An optional 1D tensor. Shape is [out_channels]. -* -* The input and output tensor attributes are listed as follows: -* @verbatim - |Tensor | x | filter | offsets | bias | y - -----------|---------|---------|---------|----------|-------- - |Data Type | float16 | float16 | float16 | float16 | float16 - -----------|---------|---------|---------|----------|-------- - |Format | NCHW | NCHW | NCHW | ND | NCHW - | | NHWC | HWCN | | | NHWC +*@li x: A 4D tensor of input image. With the format "NHWC", the data is stored +* in the order of: [batch, in_height, in_width, in_channels]. +*@li filter: A 4D tensor of learnable filters. Must have the same type as "x". +* With the format "HWCN" , the data is stored in the order of: [filter_height, +* filter_width, in_channels / groups, out_channels]. +*@li offsets: A 4D tensor of x-y coordinates offset and mask. With the format +* "NHWC", the data is stored in the order of: [batch, out_height, out_width, +* deformable_groups * filter_height * filter_width * 3]. +*@li bias: An optional 1D tensor of additive biases to the filter outputs. +* The data is stored in the order of: [out_channels]. +*\n +*\n +* The following are the supported data types and data formats: +*@verbatim + | Tensor | x | filter | offsets | bias | y + ------------|---------|---------|---------|----------|-------- + | Data Type | float16 | float16 | float16 | float16 | float16 + ------------|---------|---------|---------|----------|-------- + | Format | NCHW | NCHW | NCHW | ND | NCHW + | | NHWC | HWCN | NHWC | | NHWC @endverbatim -* It should be noted that the data types must correspond to each other, but -* the format does not need to. - +*\n +* *@par Attributes: -* @li strides: Required. A list of 4 integers. Specifying the strides of the -* convolution along the height and width. The dimension order is determined -* by the data format of "x". By default the N and C dimensions are set to 1. -* @li pads: Required. A list of 4 integers. Specifying the top, bottom, left -* and right padding. -* @li dilations: Optional. A list of 4 integers. Specifying the dilation rate -* to use for dilated convolution. Has the same dimension order and value as -* "strides". -* @li groups: Optional. Number of blocked connections from input channels to -* output channels. Input channels and output channels must both be divisible -* by "groups".Type is int32. -* @li data_format: Optional. An optional string from: "NHWC", "NCHW". Specifying the -* data format of the input and output images. Type is string. Defaults to -* "NHWC". Reserved. -* @li deformable_groups: Optional. Cut the c chanel of input X into deformable_groups, -* each share a different offsets. Input channels must be divisible by -* "deformable_groups". Type is int32. - -*@par Outputs: -* @li y: A 4D Tensor of output images. Must have the same type and format as -* "x". With shape of [batch, out_channels, out_height, out_width] when format -* is "NHWC". 
-* @li output_height = (in_height + top_pad + botton_pad -
-* dilation_h * (filter_height - 1) -1) / stride_h + 1
-* @li output_width = (in_width + left_pad + right_pad -
-* dilation_w * (filter_width - 1) -1) / stride_w + 1
-
-*@attention
-* @li The parameter scope is listed as follows:
-* @verbatim
-    |Name              | Field        | Scope
-    ------------------|--------------|----------------------------------------
-    |Input Image Size  | H dimension  | 1 <= in_height * filter_height <= 4096
-    |                  | W dimension  | 1 <= in_width * filter_width <=4096
-    ------------------|--------------|----------------------------------------
-    |Filter Size       | H dimension  | [1, 255]
-    |                  | W dimension  | [1, 255]
-    ------------------|--------------|----------------------------------------
-    |offsets Size      | C dimension  | offsets_c = deformable_groups *
-    |                  |              | filter_width * filter_height * 3
-    |                  | H dimension  | the same as output H dimension
-    |                  | W dimension  | the same as output W dimension
-    ------------------|--------------|----------------------------------------
-    |Stride Size       | H dimension  | [1, 63]
-    |                  | W dimension  | [1, 63]
-    ------------------|--------------|----------------------------------------
-    |Padding Size      | top side     | [0, 255]
-    |                  | bottom side  | [0, 255]
-    |                  | left side    | [0, 255]
-    |                  | right side   | [0, 255]
-    ------------------|--------------|----------------------------------------
-    |Dilation Size     | H dimension  | [1, 255]
-    |                  | W dimension  | [1, 255]
+*@li strides: Required. A list of 4 integers. The stride of the sliding window
+* for each dimension of input. The dimension order is interpreted according to
+* the value of data_format. The N and C dimensions must be set to 1.
+*@li pads: Required. A list of 4 integers. The number of pixels to add to each
+* (top, bottom, left, right) side of the input.
+*@li dilations: Optional. A list of 4 integers. The dilation factor for each
+* dimension of input. The dimension order is interpreted according to the value
+* of data_format. The N and C dimensions must be set to 1. Defaults to
+* [1, 1, 1, 1].
+*@li groups: Optional. An integer of type int32. The number of blocked
+* connections from input channels to output channels. In_channels and
+* out_channels must both be divisible by "groups". Defaults to 1.
+*@li data_format: Optional. A string from: "NHWC", "NCHW". Specify
+* the data format of the input and output data. Defaults to "NHWC".
+*@li deformable_groups: Optional. An integer of type int32. The number of
+* deformable group partitions. In_channels must be divisible by
+* "deformable_groups". Defaults to 1.
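As a quick cross-check of the offsets layout described above and the output-size formulas given in the Outputs section below, here is a minimal sketch; all sizes are illustrative only and are not taken from this header.

#include <cstdio>

int main() {
  // Illustrative DeformableConv2D sizes only; the numbers are not taken from the header.
  const int deformable_groups = 1, filter_h = 3, filter_w = 3;
  const int in_h = 14, in_w = 14;
  const int stride_h = 1, stride_w = 1, dilation_h = 1, dilation_w = 1;
  const int pad_top = 1, pad_bottom = 1, pad_left = 1, pad_right = 1;

  // Channel size of the "offsets" input as documented above:
  // deformable_groups * filter_height * filter_width * 3.
  const int offsets_c = deformable_groups * filter_h * filter_w * 3;  // 27

  // Output size per the formulas in the Outputs section below.
  const int out_h =
      (in_h + pad_top + pad_bottom - (dilation_h * (filter_h - 1) + 1)) / stride_h + 1;  // 14
  const int out_w =
      (in_w + pad_left + pad_right - (dilation_w * (filter_w - 1) + 1)) / stride_w + 1;  // 14

  std::printf("offsets_c=%d out_h=%d out_w=%d\n", offsets_c, out_h, out_w);
  return 0;
}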
+*\n
+*\n
+* The following value range restrictions must be met:
+*@verbatim
+    | Name              | Field  | Scope
+    --------------------|--------|----------------------------
+    | Input Image Size  | H      | [1, 100000 / H(filter)]
+    |                   | W      | [1, 4096 / W(filter)]
+    --------------------|--------|----------------------------
+    | Filter Size       | H      | [1, 255]
+    |                   | W      | [1, 255]
+    --------------------|--------|----------------------------
+    | Stride            | H      | [1, 63]
+    |                   | W      | [1, 63]
+    --------------------|--------|----------------------------
+    | Padding           | Top    | [0, 255]
+    |                   | Bottom | [0, 255]
+    |                   | Left   | [0, 255]
+    |                   | Right  | [0, 255]
+    --------------------|--------|----------------------------
+    | Dilation          | H      | [1, 255]
+    |                   | W      | [1, 255]
 @endverbatim
-
-* @li There are restrictions for certain scenarios:
-* @verbatim
-    | Output           | Restrictions
-    -------------------|---------------------------
-    | W dimension == 1 | HxW(input) == HxW(filter)
-    | H dimension == 1 |
-    -------------------|---------------------------
-    | W dimension == 1 | Not supported
-    | H dimension != 1 |
+* "W(input)" indicates the image width after padding and "W(filter)" indicates
+* the filter width after dilation.
+*\n
+*
+*@par Outputs:
+*@li y: A 4D Tensor of output feature map. Has the same type as "x". With the
+* format "NHWC", the data is stored in the order of: [batch, out_height,
+* out_width, out_channels].
+*\n
+* out_height = (in_height + pad_top + pad_bottom -
+*               (dilation_h * (filter_height - 1) + 1))
+*              / stride_h + 1
+*\n
+* out_width = (in_width + pad_left + pad_right -
+*              (dilation_w * (filter_width - 1) + 1))
+*             / stride_w + 1
+*
+*@attention Constraints:
+*@li The following restrictions on the output must be met:
+*@verbatim
+  | Output  | Restrictions
+  ---------|--------------------------------
+  | H == 1 | H * W(input) == H * W(filter)
+  | W == 1 |
+  ---------|--------------------------------
+  | H != 1 | W(input) == W(filter)
+  | W == 1 | Only for Ascend310 Hi3796V300CS
 @endverbatim
-* As shown above, "HxW(input)" indicates the image size after padding and
-* "HxW(filter)" indicates the filter size after dilation.
-
+* "H * W(input)" indicates the image size after padding and "H * W(filter)"
+* indicates the filter size after dilation. "W(input)" and "W(filter)" follow
+* the same rule on the W dimension.
+*
 *@par Quantization supported or not
-* Yes
-
+*@li No
+*
 *@par Third-party framework compatibility
-*@li Compatible with the TensorFlow operator "conv2d".
-*@li Compatible with the Caffe operator 2D "Convolution".
+*@li Compatible with the MXNet operator "DeformableConvolution".
+*@li Compatible with the PaddlePaddle operator "deformable_conv".
+*@li Compatible with the MMCV operator "deform_conv".
 */
 REG_OP(DeformableConv2D)
     .INPUT(x, TensorType({DT_FLOAT16}))
diff --git a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
index d72da385..ab35ba47 100644
--- a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
@@ -1194,8 +1194,8 @@ REG_OP(MaxPoolGradWithArgmaxV2)
 * @par Inputs:
 * One input:
-* x: An NC1HWC0 Tensor. Supported type:float16, float32, double, int8, int16,
-* int32, int64, uint8, uint16, qint8
+* x: An NC1HWC0 Tensor. Supported type: float16, float32, double, int32, int64,
+* uint8, int16, int8, uint16, qint8
 * @par Attributes:
 * @li ksize: A required list of int8, int16, int32, or int64 values,
@@ -1206,14 +1206,14 @@ REG_OP(MaxPoolGradWithArgmaxV2)
 * the input tensor. No default value.
 * @li padding_mode: A required string. Defaults to "CALCULATED".
 * @li pads:A required list of int8, int16, int32, or int64 values,
-* a data to caculate when padding_mode is "SAME" and "CALCULATED".
+* the pad values to use when padding_mode is "CALCULATED".
 * @li data_format: An optional string. Defaults to "NHWC" .
 * @li global_pooling bool, Whether to use the global pooling.
 * If global_pooling = true, kernel size and paddings will be ignored.
 * Default False
-* @li ceil_mode:global_pooling (bool) – (bool) Whether to use the global pooling.
-* If global_pooling = true, kernel size and paddings will be ignored.
-* Default False \n
+* @li ceil_mode: Whether to use the ceil function to calculate the output
+* height and width. If set to False, the floor function is used.
+* Defaults to False. \n
 * @par Outputs:
 * y: A Tensor. Has the same type and format as input "x" . \n
@@ -1230,8 +1230,8 @@ REG_OP(MaxPoolGradWithArgmaxV2)
 * Compatible with the TensorFlow operator MaxPool.
 */
 REG_OP(MaxPoolV3)
-    .INPUT(x,TensorType({DT_FLOAT16, DT_FLOAT32}))
-    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT32}))
+    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16, DT_QINT8}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16, DT_QINT8}))
     .REQUIRED_ATTR(ksize, ListInt)
     .REQUIRED_ATTR(strides, ListInt)
     .ATTR(padding_mode, String, "CALCULATED")
@@ -1258,14 +1258,14 @@ REG_OP(MaxPoolV3)
 * the input tensor. No default value.
 * @li padding_mode: A required string. Defaults to "CALCULATED".
 * @li pads:A required list of int8, int16, int32, or int64 values,
-* a data to caculate when padding_mode is "SAME" and "CALCULATED".
+* the pad values to use when padding_mode is "CALCULATED".
 * @li data_format: An optional string. Defaults to "NHWC" .
 * @li global_pooling bool, Whether to use the global pooling.
 * If global_pooling = true, kernel size and paddings will be ignored.
 * Default False
-* @li ceil_mode:global_pooling (bool) – (bool) Whether to use the global pooling.
-* If global_pooling = true, kernel size and paddings will be ignored.
-* Default False \n
+* @li ceil_mode: Whether to use the ceil function to calculate the output
+* height and width. If set to False, the floor function is used.
+* Defaults to False. \n
 * @par Outputs:
 * y: A mutable tensor. Has the same shape and type as "x1" . \n
diff --git a/third_party/fwkacllib/inc/ops/pad_ops.h b/third_party/fwkacllib/inc/ops/pad_ops.h
index e1c2a20d..f746b3b3 100644
--- a/third_party/fwkacllib/inc/ops/pad_ops.h
+++ b/third_party/fwkacllib/inc/ops/pad_ops.h
@@ -403,6 +403,5 @@ REG_OP(EmbeddingRankId)
     .ATTR(mode, String, "mod")
     .OP_END_FACTORY_REG(EmbeddingRankId)
-
 } // namespace ge
 #endif  // OPS_BUILT_IN_OP_PROTO_INC_PAD_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/target_crop_and_resize.h b/third_party/fwkacllib/inc/ops/target_crop_and_resize.h
new file mode 100644
index 00000000..9c61f2c9
--- /dev/null
+++ b/third_party/fwkacllib/inc/ops/target_crop_and_resize.h
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*!
+ * \file target_crop_and_resize.h
+ * \brief
+ */
+#ifndef GE_OP_TARGET_CROP_AND_RESIZE_H
+#define GE_OP_TARGET_CROP_AND_RESIZE_H
+
+#include "graph/operator_reg.h"
+
+namespace ge {
+
+/**
+*@brief Performs crop and resize on images.
+
+*@par Inputs:
+*@li x: An NCHW tensor of type uint8, specifying the input to the data layer.
+*@li boxes: Crop parameters of type int32. \n
+*@li box_index: Batch index parameters of type int32, specifying which batch
+* of the input "x" is to be cropped and resized. \n
+
+*@par Attributes:
+*output_h: A required int, specifying the height of the output. \n
+*output_w: A required int, specifying the width of the output. \n
+*input_format: A required string, specifying the input format. \n
+
+*@par Outputs:
+*y: The output tensor of type uint8. Only the NC1HWC0_C04 format is supported.
+*@par Third-party framework compatibility
+* It is a custom operator. It has no corresponding operator in Caffe.
+*
+*@par Restrictions:
+*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
+*/
+REG_OP(TargetCropAndResize)
+    .INPUT(x, TensorType({DT_UINT8}))
+    .INPUT(boxes, TensorType({DT_INT32}))
+    .INPUT(box_index, TensorType({DT_INT32}))
+    .OUTPUT(y, TensorType({DT_UINT8}))
+    .ATTR(output_h, Int, 224)
+    .ATTR(output_w, Int, 224)
+    .ATTR(input_format, String, "YUV420SP_U8")
+    .OP_END_FACTORY_REG(TargetCropAndResize)
+}  // namespace ge
+#endif  // GE_OP_TARGET_CROP_AND_RESIZE_H
diff --git a/third_party/fwkacllib/inc/tdt/status.h b/third_party/fwkacllib/inc/tdt/status.h
index 185d2b9c..d3d6e7d1 100644
--- a/third_party/fwkacllib/inc/tdt/status.h
+++ b/third_party/fwkacllib/inc/tdt/status.h
@@ -193,6 +193,7 @@ enum {
   TDT_HDC_SRV_TYPE_ERROR_CODE,
   TDT_TSD_CLT_OPEN_FAILED_CODE,
   TDT_TSD_CLT_CLOSE_FAILED_CODE,
+  TDT_TSD_CLT_UPDATE_PROFILING_FAILED_CODE,
   TDT_TSD_CLT_INTERFACE_NOT_SUPPORT_CODE,
   TDT_SUPERVISOR_ILLEGAL_HEARTBEAT_TIME_CODE,
   TDT_SUPERVISOR_INOTIFY_READ_SIZE_ERROR_CODE,
@@ -697,6 +698,8 @@ TDT_DEF_ERROR_CODE(MODID_HDC_SERVER, TDT_ERROR, TDT_BIND_CPUCORE_FAILED, "thread
 TDT_DEF_ERROR_CODE(MODID_HDC_SERVER, TDT_ERROR, TDT_HDC_SRV_CLOSED, "hdc server has been closed");
 TDT_DEF_ERROR_CODE(MODID_TSD_CLIENT, TDT_ERROR, TDT_TSD_CLT_OPEN_FAILED, "tsd client open failed");
 TDT_DEF_ERROR_CODE(MODID_TSD_CLIENT, TDT_ERROR, TDT_TSD_CLT_CLOSE_FAILED, "tsd client close failed");
+TDT_DEF_ERROR_CODE(MODID_TSD_CLIENT, TDT_ERROR, TDT_TSD_CLT_UPDATE_PROFILING_FAILED,
+                   "tsd client update profiling failed");
 TDT_DEF_ERROR_CODE(MODID_TSD_CLIENT, TDT_ERROR, TDT_TSD_CLT_INTERFACE_NOT_SUPPORT, "tsd client func not support");
 TDT_DEF_ERROR_CODE(MODID_TDT_PREFETCH, TDT_ERROR, TDT_PREFETCH_FILELIST_NOT_EXIST, "tdt filelist open failed");
 TDT_DEF_ERROR_CODE(MODID_TDT_PREFETCH, TDT_ERROR, TDT_PREFETCH_SAMPLE_FILE_NOT_FOUND, "tdt sample file is empty");
diff --git a/third_party/fwkacllib/inc/tdt/tsd_client.h b/third_party/fwkacllib/inc/tdt/tsd_client.h
index 7886488e..6066a12e 100644
--- a/third_party/fwkacllib/inc/tdt/tsd_client.h
+++ b/third_party/fwkacllib/inc/tdt/tsd_client.h
@@ -49,7 +49,7 @@ extern "C" {
 * @li tsd_client.h: Header file where the interface declaration is located.
 * @li data_common.h: Header file where 'TDT_StatusT' defined
 */
-TDT_StatusT TsdOpen(const uint32_t phyDeviceId, const uint32_t rankSize);
+TDT_LIB_EXPORT TDT_StatusT TsdOpen(const uint32_t phyDeviceId, const uint32_t rankSize);

 /**
 * @ingroup Close
@@ -67,7 +67,25 @@ TDT_StatusT TsdOpen(const uint32_t phyDeviceId, const uint32_t rankSize);
 * @li tsd_client.h: Header file where the interface declaration is located.
 * @li data_common.h: Header file where 'TDT_StatusT' defined
 */
-TDT_StatusT TsdClose(const uint32_t phyDeviceId);
+TDT_LIB_EXPORT TDT_StatusT TsdClose(const uint32_t phyDeviceId);
+
+/**
+* @ingroup UpdateProfilingMode
+* @brief Notify the TSD client to update the profiling mode.
+*
+* @par Function
+* Notifies the TSD client to update the profiling mode.
+*
+* @param phyDeviceId Physical device ID
+* @param flag Profiling mode flag
+* @retval TDT_OK Success
+* @retval OtherValues Failure
+*
+* @par Dependency
+* @li libtsdclient.so: Library to which the interface belongs.
+* @li tsd_client.h: Header file where the interface declaration is located.
+* @li data_common.h: Header file where 'TDT_StatusT' defined
+*/
+TDT_LIB_EXPORT TDT_StatusT UpdateProfilingMode(const uint32_t phyDeviceId, const uint32_t flag);

 /**
 * @ingroup CreateCmdParameterObj