!8323 [Auto parallel] Supporting for-loop in strategy-searching

From: @xiaoda_zh Reviewed-by: Signed-off-by:
5 years ago · 4e07f43dff
parent 9c23fa0a6c aa13d6b1cd
commit 4e07f43dff
20 changed files with 611 additions and 22 deletions
--- a/mindspore/ccsrc/frontend/parallel/auto_parallel/graph_costmodel.cc
+++ b/mindspore/ccsrc/frontend/parallel/auto_parallel/graph_costmodel.cc
@ -43,6 +43,7 @@ int64_t RUN_PHASE = DEFAULT_RUN_PHASE;
 bool TRIANGLE_STAR_STRATEGY_OVERWRITE = DEFAULT_TRIANGLE_STAR_STRATEGY_OVERWRITE;
 bool DP_ALGO_ENABLE_APPROX = DEFAULT_DP_ALGO_ENABLE_APPROX;
 double DP_ALGO_APPROX_EPSILON = DEFAULT_DP_ALGO_APPROX_EPSILON;
+bool DP_ALGO_SINGLE_LOOP = DEFAULT_DP_ALGO_SINGLE_LOOP;

 void CostGraph::SetDeviceMemoryAndCostParameter() {
  MS_EXCEPTION_IF_NULL(CostModelContext::GetInstance());
@ -187,6 +188,14 @@ void CostGraph::SetDeviceMemoryAndCostParameter() {
  }
  DP_ALGO_APPROX_EPSILON = epsilon;
  MS_LOG(INFO) << "epsilon: " << epsilon << ".";
+
+  auto single_loop = CostModelContext::GetInstance()->dp_algo_single_loop();
+  DP_ALGO_SINGLE_LOOP = single_loop;
+  if (single_loop) {
+    MS_LOG(INFO) << "dp_algo_single_loop: true.";
+  } else {
+    MS_LOG(INFO) << "dp_algo_single_loop: false.";
+  }
 }

 void CostGraph::RemoveOperator(const OperatorInfoPtr &op) {
--- a/mindspore/ccsrc/frontend/parallel/auto_parallel/graph_costmodel.h
+++ b/mindspore/ccsrc/frontend/parallel/auto_parallel/graph_costmodel.h
@ -49,6 +49,7 @@ extern bool DP_ALGO_ENABLE_APPROX;
 extern double DP_ALGO_APPROX_EPSILON;
 extern int64_t RUN_PHASE;
 extern bool TRIANGLE_STAR_STRATEGY_OVERWRITE;
+extern bool DP_ALGO_SINGLE_LOOP;

 class CostGraph {
  // 'CostGraph' consists of Operators and edges between them. An edge is created between two Operators if they have
--- a/mindspore/ccsrc/frontend/parallel/costmodel_context.cc
+++ b/mindspore/ccsrc/frontend/parallel/costmodel_context.cc
@ -56,6 +56,7 @@ void CostModelContext::ResetCostModel() {
  costmodel_allreduce_fusion_allreduce_bandwidth_ = DEFAULT_COST_MODEL_ALLREDUCE_FUSION_ALLREDUCE_BANDWIDTH;
  costmodel_allreduce_fusion_computation_time_parameter_ =
    DEFAULT_COST_MODEL_ALLREDUCE_FUSION_COMPUTATION_TIME_PARAMETER;
+  dp_algo_single_loop_ = DEFAULT_DP_ALGO_SINGLE_LOOP;
 }

 void CostModelContext::ResetAlgoParameters() {
@ -146,6 +147,8 @@ void CostModelContext::set_triangle_star_strategy_overwrite(bool overwrite) {

 void CostModelContext::set_run_phase(int64_t phase) { run_phase_ = phase; }

+// Setter for the flag deciding whether a single suite of OperatorInfo is generated for a loop.
+void CostModelContext::set_dp_algo_single_loop(bool single_loop) {
+  dp_algo_single_loop_ = single_loop;
+}
+
 struct CostRegister {
  CostRegister() {
    MsContext::device_seter([](const std::string &device_target) {
--- a/mindspore/ccsrc/frontend/parallel/costmodel_context.h
+++ b/mindspore/ccsrc/frontend/parallel/costmodel_context.h
@ -47,6 +47,7 @@ namespace parallel {
 #define DEFAULT_TRIANGLE_STAR_STRATEGY_OVERWRITE true;
 #define DEFAULT_DP_ALGO_ENABLE_APPROX false
 #define DEFAULT_DP_ALGO_APPROX_EPSILON 0.1
+#define DEFAULT_DP_ALGO_SINGLE_LOOP true

 class CostModelContext {
 public:
@ -149,6 +150,9 @@ class CostModelContext {
  void set_dp_algo_enable_approxi(bool);
  bool dp_algo_enable_approxi() const { return dp_algo_enable_approxi_; }

+  void set_dp_algo_single_loop(bool);
+  bool dp_algo_single_loop() const { return dp_algo_single_loop_; }
+
 private:
  CostModelContext();
  static std::shared_ptr<CostModelContext> cm_context_inst_;
@ -190,6 +194,9 @@ class CostModelContext {
  // When APPROXIMATION is enabled in the DP algorithm, the 'epsilon' value used in the APPROXIMATION.
  double dp_algo_approxi_epsilon_;

+  // Whether to generate a single suite of OperatorInfo for a loop.
+  bool dp_algo_single_loop_;
+
  int64_t run_phase_;  // 0: 'training', 1: 'inference'

  int64_t costmodel_allreduce_fusion_algorithm_;
--- a/mindspore/ccsrc/frontend/parallel/graph_util/graph_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/graph_util/graph_info.cc
@ -14,12 +14,14 @@
 * limitations under the License.
 */

+#include <regex>
 #include "frontend/parallel/graph_util/graph_info.h"
 #include "debug/anf_ir_dump.h"
 #include "debug/anf_ir_utils.h"
 #include "debug/draw.h"
 #include "utils/ms_context.h"
 #include "ir/graph_utils.h"
+#include "pipeline/jit/pipeline.h"

 namespace mindspore {
 namespace parallel {
@ -50,5 +52,26 @@ void DumpGraph(const FuncGraphPtr &root, const std::string &name) {
    ExportIR(name + ".dat", "0", root);
  }
 }
+
+// Extract the loop index encoded in a CNode's fullname_with_scope.
+// The name is searched for CELLLIST_KEYWORD_PATTERN; when it matches, the number captured by the
+// pattern's first group is written to '*loop_index' and true is returned. Otherwise '*loop_index' is
+// left untouched and false is returned.
+// Raises an exception if 'cnode' or 'loop_index' is null, or if the match carries no capture group.
+bool GetLoopIndexFromCNode(const CNodePtr &cnode, size_t *loop_index) {
+  if (cnode == nullptr || loop_index == nullptr) {
+    MS_LOG(EXCEPTION) << "GetLoopIndexFromCNode got a null cnode or a null loop_index.";
+  }
+  // Compiling a std::regex is expensive and this function is called repeatedly while the cost graph
+  // is built, so the pattern is compiled only once.
+  static const std::regex pattern(CELLLIST_KEYWORD_PATTERN);
+  std::smatch result;
+  const auto &cnode_fullname = cnode->fullname_with_scope();
+  if (!std::regex_search(cnode_fullname, result, pattern)) {
+    return false;
+  }
+  // size() is the number of sub-matches (whole match + capture groups); length() would only be the
+  // character length of the whole match and could never detect a missing capture group.
+  if (result.size() < 2) {
+    MS_LOG(EXCEPTION) << "Wrong format of fullname_with_scope: " << cnode_fullname;
+  }
+  // std::stoul keeps the value unsigned, matching size_t, instead of converting through int.
+  *loop_index = std::stoul(result[1].str());
+  return true;
+}
+
+// Hand the number of operators over to the ExecutorPy instance returned by GetInstance().
+void SetOpsNumToExecutor(size_t num_ops) {
+  pipeline::ExecutorPy::GetInstance()->SetNumOpsInfo(num_ops);
+}
 }  // namespace parallel
 }  // namespace mindspore
--- a/mindspore/ccsrc/frontend/parallel/graph_util/graph_info.h
+++ b/mindspore/ccsrc/frontend/parallel/graph_util/graph_info.h
@ -26,6 +26,8 @@ namespace mindspore {
 namespace parallel {
 std::vector<PrimitivePtr> FindPrimtive(const FuncGraphPtr &graph, const std::string &name);
 void DumpGraph(const FuncGraphPtr &root, const std::string &name);
+bool GetLoopIndexFromCNode(const CNodePtr &cnode, size_t *loop_index);
+void SetOpsNumToExecutor(size_t);
 }  // namespace parallel
 }  // namespace mindspore

--- a/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h
@ -149,6 +149,7 @@ constexpr char FIELD_SIZE[] = "field_size";
 constexpr char OPTIMIZER_SUB_STRING[] = "optimizer";
 constexpr char DEVICE[] = "Device";
 constexpr char PARALLEL_OPTIMIZER_ALLGATHER[] = "parallel_optimizer_allgather";
+constexpr char CELLLIST_KEYWORD_PATTERN[] = "-CellList/(\\d+)-";

 // Operator
 constexpr char VIRTUAL_DIV[] = "_VirtualDiv";
--- a/mindspore/ccsrc/frontend/parallel/step_auto_parallel.cc
+++ b/mindspore/ccsrc/frontend/parallel/step_auto_parallel.cc
@ -38,6 +38,7 @@
 #include "frontend/parallel/auto_parallel/rec_core/rec_partition.h"
 #include "frontend/parallel/context.h"
 #include "frontend/parallel/graph_util/node_info.h"
+#include "frontend/parallel/graph_util/graph_info.h"
 #include "frontend/parallel/ops_info/reshape_info.h"
 #include "frontend/parallel/ops_info/tmp_identity_info.h"
 #include "frontend/parallel/step_parallel.h"
@ -346,6 +347,39 @@ bool IsAutoParallelCareNode(const CNodePtr &cnode) {
  return IsParallelCareNode(cnode) && IsSplittableOperator(prim->name());
 }

+// Records the names of the operators that appear in a for-loop.
+// Currently, we assume that the operators in different for-loops are identical, and that their traversal
+// orders are also identical.
+// Therefore, we create OperatorInfo objects for the operators of one loop (say, loop-3), and reuse them in
+// the remaining loops (loop-2, loop-1 and loop-0).
+std::set<std::string> ops_in_a_loop_;
+// Tell whether two operators belong to two different for-loops.
+// Returns false if at least one of them is not recorded in 'ops_in_a_loop_', or if both carry the
+// same loop index; returns true only when both are recorded and their loop indices differ.
+// Raises an exception if a CNode has no OperatorInfo attached, or if a recorded operator's
+// fullname_with_scope yields no loop index.
+bool IsOperatorsInTwoSeparateLoops(const CNodePtr &a_cnode, const CNodePtr &b_cnode) {
+  auto first_info = a_cnode->user_data<OperatorInfo>();
+  MS_EXCEPTION_IF_NULL(first_info);
+  auto second_info = b_cnode->user_data<OperatorInfo>();
+  MS_EXCEPTION_IF_NULL(second_info);
+  const bool both_recorded =
+    (ops_in_a_loop_.count(first_info->name()) > 0) && (ops_in_a_loop_.count(second_info->name()) > 0);
+  if (!both_recorded) {
+    return false;
+  }
+  size_t first_loop = 0;
+  size_t second_loop = 0;
+  if (!GetLoopIndexFromCNode(a_cnode, &first_loop)) {
+    MS_LOG(EXCEPTION) << "The operator with fullname_with_scope: " << a_cnode->fullname_with_scope()
+                      << " was not included in the set.";
+  }
+  if (!GetLoopIndexFromCNode(b_cnode, &second_loop)) {
+    MS_LOG(EXCEPTION) << "The operator with fullname_with_scope: " << b_cnode->fullname_with_scope()
+                      << " was not included in the set.";
+  }
+  return first_loop != second_loop;
+}
+
 OperatorInfoPtr CreateTheOperatorInfo(const PrimitivePtr &prim, const CNodePtr &cnode, StrategyMap *stra_map) {
  MS_EXCEPTION_IF_NULL(prim);
  MS_EXCEPTION_IF_NULL(cnode);
@ -460,6 +494,10 @@ Status ConstructCostGraphNodesByUniqueId(const std::vector<AnfNodePtr> &all_node
  entire_costgraph->SetDeviceMemoryAndCostParameter();
  // The map from CNode's UniqueId to its operatorInfo
  std::map<std::string, OperatorInfoPtr> from_cnode_to_info;
+  // The operator_infos in a loop
+  std::vector<OperatorInfoPtr> operators_in_forloop;
+  // Key: i-th loop; Value: index of 'operators_in_forloop'
+  std::map<size_t, size_t> loop_to_ops;
  // extract strategy from checkpoint for multi-train
  StrategyMap stra_map;
  if (StrategyCheckpoint::GetInstance().LoadCheckPointOn()) {
@ -491,6 +529,27 @@ Status ConstructCostGraphNodesByUniqueId(const std::vector<AnfNodePtr> &all_node

    auto search_cnode = from_cnode_to_info.find(cnode->UniqueId());
    if (search_cnode == from_cnode_to_info.end()) {
+      size_t loop_index = 0;
+      bool is_in_loop = GetLoopIndexFromCNode(cnode, &loop_index);
+      if (DP_ALGO_SINGLE_LOOP && is_in_loop && (loop_to_ops[loop_index] < operators_in_forloop.size())) {
+        const auto &current_op_ptr = operators_in_forloop[loop_to_ops[loop_index]];
+        bool is_find_wrong = (current_op_ptr->name().find(VIRTUAL_DATA_SET_INFO) == std::string::npos) &&
+                             (current_op_ptr->name().find(BATCH_PARALLEL) == std::string::npos) &&
+                             (current_op_ptr->name().find(prim->name()) == std::string::npos);
+        if (is_find_wrong) {
+          MS_LOG(EXCEPTION) << "The OperatorInfo: " << current_op_ptr->name()
+                            << " does not match the Prim: " << prim->name()
+                            << ". The fullname_with_scope: " << cnode->fullname_with_scope();
+        }
+        loop_to_ops[loop_index]++;
+        cnode->set_user_data<OperatorInfo>(current_op_ptr);
+        MS_LOG(INFO) << "The CNode with UniqueId: " << cnode->UniqueId()
+                     << " and UniqueIdThroughCopy: " << cnode->UniqueIdThroughCopy()
+                     << ", CNode fullname_with_scope: " << cnode->fullname_with_scope()
+                     << " is set OperatorInfo: " << current_op_ptr->name() << ", Primitive: " << prim->name();
+        (void)from_cnode_to_info.emplace(std::make_pair(cnode->UniqueId(), current_op_ptr));
+        continue;
+      }
      auto operator_info = CreateTheOperatorInfo(prim, cnode, &stra_map);
      if (operator_info == nullptr) {
        return FAILED;
@ -503,8 +562,14 @@ Status ConstructCostGraphNodesByUniqueId(const std::vector<AnfNodePtr> &all_node
      cnode->set_user_data<OperatorInfo>(operator_info);
      MS_LOG(INFO) << "The CNode with UniqueId: " << cnode->UniqueId()
                   << " and UniqueIdThroughCopy: " << cnode->UniqueIdThroughCopy()
+                   << ", CNode fullname_with_scope: " << cnode->fullname_with_scope()
                   << " is set OperatorInfo: " << operator_info->name() << ", Primitive: " << prim->name();
-      (void)from_cnode_to_info.emplace(std::make_pair(cnode->UniqueIdThroughCopy(), operator_info));
+      (void)from_cnode_to_info.emplace(std::make_pair(cnode->UniqueId(), operator_info));
+      if (DP_ALGO_SINGLE_LOOP && is_in_loop) {
+        operators_in_forloop.push_back(operator_info);
+        ops_in_a_loop_.insert(operator_info->name());
+        loop_to_ops[loop_index]++;
+      }
      // Needed by rec_parser
      entire_costgraph->add_inputs_tensor_name(inputs_tensor_name);
    } else {
@ -526,6 +591,10 @@ Status ConstructCostGraphNodesByUniqueIdTC(const std::vector<AnfNodePtr> &all_no
  entire_costgraph->SetDeviceMemoryAndCostParameter();
  // The map from CNode's UniqueIdThroughCopy to its operatorInfo
  std::map<std::string, OperatorInfoPtr> from_cnode_to_info;
+  // The operator_infos in a loop
+  std::vector<OperatorInfoPtr> operators_in_forloop;
+  // Key: i-th loop; Value: index of 'operators_in_forloop'
+  std::map<size_t, size_t> loop_to_ops;
  // extract strategy from checkpoint for multi-train
  StrategyMap stra_map;
  if (StrategyCheckpoint::GetInstance().LoadCheckPointOn()) {
@ -556,6 +625,27 @@ Status ConstructCostGraphNodesByUniqueIdTC(const std::vector<AnfNodePtr> &all_no
    // Find the operatorInfo if it exists
    auto search_cnode = from_cnode_to_info.find(cnode->UniqueIdThroughCopy());
    if (search_cnode == from_cnode_to_info.end()) {
+      size_t loop_index = 0;
+      bool is_in_loop = GetLoopIndexFromCNode(cnode, &loop_index);
+      if (DP_ALGO_SINGLE_LOOP && is_in_loop && (loop_to_ops[loop_index] < operators_in_forloop.size())) {
+        const auto &current_op_ptr = operators_in_forloop[loop_to_ops[loop_index]];
+        bool is_find_wrong = (current_op_ptr->name().find(VIRTUAL_DATA_SET_INFO) == std::string::npos) &&
+                             (current_op_ptr->name().find(BATCH_PARALLEL) == std::string::npos) &&
+                             (current_op_ptr->name().find(prim->name()) == std::string::npos);
+        if (is_find_wrong) {
+          MS_LOG(EXCEPTION) << "The OperatorInfo: " << current_op_ptr->name()
+                            << " does not match the Prim: " << prim->name()
+                            << ". The fullname_with_scope: " << cnode->fullname_with_scope();
+        }
+        loop_to_ops[loop_index]++;
+        cnode->set_user_data<OperatorInfo>(current_op_ptr);
+        MS_LOG(INFO) << "The CNode with UniqueId: " << cnode->UniqueId()
+                     << " and UniqueIdThroughCopy: " << cnode->UniqueIdThroughCopy()
+                     << ", CNode fullname_with_scope: " << cnode->fullname_with_scope()
+                     << " is set OperatorInfo: " << current_op_ptr->name() << ", Primitive: " << prim->name();
+        (void)from_cnode_to_info.emplace(std::make_pair(cnode->UniqueIdThroughCopy(), current_op_ptr));
+        continue;
+      }
      // In this case, the corresponding OperatorInfo is not created, create the new one.
      auto operator_info = CreateTheOperatorInfo(prim, cnode, &stra_map);
      if (operator_info == nullptr) {
@ -569,8 +659,14 @@ Status ConstructCostGraphNodesByUniqueIdTC(const std::vector<AnfNodePtr> &all_no
      cnode->set_user_data<OperatorInfo>(operator_info);
      MS_LOG(INFO) << "The CNode with UniqueId: " << cnode->UniqueId()
                   << " and UniqueIdThroughCopy: " << cnode->UniqueIdThroughCopy()
+                   << ", CNode fullname_with_scope: " << cnode->fullname_with_scope()
                   << " is set OperatorInfo: " << operator_info->name() << ", Primitive: " << prim->name();
      (void)from_cnode_to_info.emplace(std::make_pair(cnode->UniqueIdThroughCopy(), operator_info));
+      if (DP_ALGO_SINGLE_LOOP && is_in_loop) {
+        operators_in_forloop.push_back(operator_info);
+        ops_in_a_loop_.insert(operator_info->name());
+        loop_to_ops[loop_index]++;
+      }
      // Needed by rec_parser
      entire_costgraph->add_inputs_tensor_name(inputs_tensor_name);
    } else {
@ -642,7 +738,12 @@ void ConstructCostGraphEdges(const std::vector<AnfNodePtr> &all_nodes) {
          }
          EdgePtr edge_ptr;
          MS_LOG(INFO) << "Creating edge: " << edge_name;
-
+          if (IsOperatorsInTwoSeparateLoops(prev_cnode, cnode)) {
+            MS_LOG(INFO) << "prev_cnode_fullname: " << prev_cnode->fullname_with_scope()
+                         << ", cnode_fullname: " << cnode->fullname_with_scope();
+            MS_LOG(INFO) << "The two operators in two separate for-loops, thus skip the edge.";
+            break;
+          }
          bool follow_strategy = (prim->name() == RESHAPE) || (prev_prim->name() == RESHAPE) ||
                                 (ELEMENTWISE_OP_STRA_FOLLOW && IsElementWiseOperator(prev_prim->name()));
          if (follow_strategy) {
@ -1044,8 +1145,11 @@ Status ParallelStrategySearch(const std::vector<AnfNodePtr> &all_nodes, const Fu

  // Step 3: Augment the costgraph.
  AugmentCostGraph(all_nodes);
-  MS_LOG(INFO) << "After the augmenting procedure, there are " << entire_costgraph->GetOperators().size()
-               << " operators, and " << entire_costgraph->GetNumEdges() << " edges.";
+  auto num_ops = entire_costgraph->GetOperators().size();
+  SetOpsNumToExecutor(num_ops);
+  auto num_edges = entire_costgraph->GetNumEdges();
+  MS_LOG(INFO) << "After the augmenting procedure, there are " << num_ops << " operators, and " << num_edges
+               << " edges.";

  // Step 3.1: Calculate the memory usage
  if (entire_costgraph->CalculateMemoryCost() != SUCCESS) {
@ -1071,6 +1175,7 @@ Status ParallelStrategySearch(const std::vector<AnfNodePtr> &all_nodes, const Fu
    MS_LOG(INFO) << op->name() << " : The strategy is:";
    PrintStrategy(s_strategy);
  }
+  ops_in_a_loop_.clear();

  return SUCCESS;
 }
--- a/mindspore/ccsrc/pipeline/jit/init.cc
+++ b/mindspore/ccsrc/pipeline/jit/init.cc
@ -82,6 +82,8 @@ PYBIND11_MODULE(_c_expression, m) {
         "Get Parameter Tensor Layout Dictionary.")
    .def("get_strategy", &ExecutorPy::GetCNodeStrategy, py::arg("phase") = py::str("train"),
         "Get CNode Strategy Dictionary.")
+    .def("get_num_parallel_ops", &ExecutorPy::GetNumOpsInfo, py::arg("phase") = py::str("train"),
+         "Get the number of parallel operators.")
    .def("get_allreduce_fusion", &ExecutorPy::GetAllreduceFusion, py::arg("phase") = py::str("train"),
         "Get Allreduce Fusion Dictionary.")
    .def("fetch_info_for_quant_export", &ExecutorPy::FetchInfoForQuantExport, py::arg("phase") = py::str("train"),
@ -254,6 +256,10 @@ PYBIND11_MODULE(_c_expression, m) {
         "Set the epsilon which is used in the approximation of DP algorithm.")
    .def("get_dp_algo_approxi_epsilon", &CostModelContext::dp_algo_approxi_epsilon,
         "Get the epsilon which is used in the approximation of DP algorithm.")
+    .def("set_dp_algo_single_loop", &CostModelContext::set_dp_algo_single_loop,
+         "Set the flag of generating a single suite of OperatorInfos in for-loop.")
+    .def("get_dp_algo_single_loop", &CostModelContext::dp_algo_single_loop,
+         "Get the flag of whether or not generating a single suite of OperatorInfos in for-loop.")
    .def("reset_cost_model", &CostModelContext::ResetCostModel, "Reset the CostModelContext.")
    .def("reset_algo_parameters", &CostModelContext::ResetAlgoParameters, "Reset the AlgoParameters.");

--- a/mindspore/ccsrc/pipeline/jit/pipeline.cc
+++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc
@ -252,6 +252,16 @@ void ExecutorPy::SetCNodeStrategy(const std::string &name, const parallel::Strat
  stra_dict_[phase_][py::str(name)] = strategy;
 }

+// Return the number of operators recorded for 'phase', or 0 if nothing was recorded for it.
+size_t ExecutorPy::GetNumOpsInfo(const std::string &phase) {
+  MS_LOG(DEBUG) << "GetNumOpsInfo!";
+  // Look the phase up with find(): operator[] would insert a zero entry for every unknown phase that
+  // is queried, silently growing the map from a getter.
+  const auto iter = phase_to_num_op_info_.find(phase);
+  if (iter == phase_to_num_op_info_.end()) {
+    return 0;
+  }
+  return iter->second;
+}
+
+// Remember 'num_ops' under the phase currently stored in 'phase_'.
+void ExecutorPy::SetNumOpsInfo(size_t num_ops) {
+  MS_LOG(DEBUG) << "SetNumOpsInfo!";
+  auto &slot = phase_to_num_op_info_[phase_];
+  slot = num_ops;
+}
+
 py::dict ExecutorPy::GetAllreduceFusion(const std::string &phase) {
  MS_LOG(INFO) << "GetAllreduceFusion!";
  auto graph = GetFuncGraph(phase);
--- a/mindspore/ccsrc/pipeline/jit/pipeline.h
+++ b/mindspore/ccsrc/pipeline/jit/pipeline.h
@ -93,6 +93,8 @@ class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> {
  py::dict GetParameterLayout(const std::string &phase);
  py::dict GetCNodeStrategy(const std::string &phase);
  void SetCNodeStrategy(const std::string &name, const parallel::Strategys &strategy);
+  size_t GetNumOpsInfo(const std::string &phase);
+  void SetNumOpsInfo(size_t);
  py::dict GetAllreduceFusion(const std::string &phase);
  void DelNetRes(const std::string &id);
  void ReleaseResource(const py::object &phase);
@ -117,6 +119,7 @@ class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> {
  static bool debugger_terminate_;
  std::map<std::string, py::dict> stra_dict_;
  std::string phase_ = "";
+  std::map<std::string, size_t> phase_to_num_op_info_;
 };
 using ExecutorPyPtr = std::shared_ptr<ExecutorPy>;

--- a/mindspore/common/api.py
+++ b/mindspore/common/api.py
@ -455,6 +455,10 @@ class _Executor:
        real_phase = self.phase_prefix + obj.phase + '.' + str(obj.create_time)
        return self._executor.get_strategy(real_phase)

+    def _get_num_parallel_ops(self, obj):
+        """Query the executor for the number of parallel operators of the phase built from `obj`."""
+        phase_key = self.phase_prefix + obj.phase + '.' + str(obj.create_time)
+        num_ops = self._executor.get_num_parallel_ops(phase_key)
+        return num_ops
+
    def _get_allreduce_fusion(self, obj):
        real_phase = self.phase_prefix + obj.phase + '.' + str(obj.create_time)
        return self._executor.get_allreduce_fusion(real_phase)
--- a/mindspore/parallel/_cost_model_context.py
+++ b/mindspore/parallel/_cost_model_context.py
@ -266,6 +266,31 @@ class _CostModelContext:
            raise ValueError("Context handle is none in context!!!")
        return self._context_handle.get_run_phase()

+    def set_dp_algo_single_loop(self, single_loop):
+        """
+        Configure whether a single suite of OperatorInfos is generated in for-loop.
+
+        Args:
+            single_loop (bool): The value of the single loop flag.
+
+        Raises:
+            ValueError: If context handle is none.
+        """
+        handle = self._context_handle
+        if handle is None:
+            raise ValueError("Context handle is none in context!!!")
+        handle.set_dp_algo_single_loop(single_loop)
+
+    def get_dp_algo_single_loop(self):
+        """
+        Return the flag telling whether a single suite of OperatorInfos is generated in for-loop.
+
+        Raises:
+            ValueError: If context handle is none.
+        """
+        handle = self._context_handle
+        if handle is None:
+            raise ValueError("Context handle is none in context!!!")
+        return handle.get_dp_algo_single_loop()
+
    def set_costmodel_allreduce_fusion_algorithm(self, algorithm):
        """
        Set costmodel allreduce fusion algorithm.
@ -602,4 +627,19 @@ def _get_multi_subgraphs():
    """
        Get the flag of ANF graph containing multiple subgraphs.
    """
-    cost_model_context().get_multi_subgraphs()
+    return cost_model_context().get_multi_subgraphs()
+
+def _set_algo_single_loop(single_loop=True):
+    """
+    Configure whether a single suite of OperatorInfos is generated in for-loop.
+
+    Args:
+        single_loop (bool): The value of the single loop flag. Default: True.
+    """
+    context_obj = cost_model_context()
+    context_obj.set_dp_algo_single_loop(single_loop)
+
+def _get_algo_single_loop():
+    """
+    Return the flag telling whether a single suite of OperatorInfos is generated in for-loop.
+    """
+    context_obj = cost_model_context()
+    return context_obj.get_dp_algo_single_loop()
--- a/tests/ut/python/parallel/test_auto_parallel_arithmetic.py
+++ b/tests/ut/python/parallel/test_auto_parallel_arithmetic.py
@ -78,8 +78,8 @@ def test_auto_parallel_arithmetic():
    b = Tensor(np.ones([64, 128]), dtype=ms.float32)
    compile_net(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
-    expected_strategies = {'Default/network-Net/FloorDiv-op1': [[2, 4], [2, 4]],
-                           'Default/network-Net/MatMul-op0': [[2, 1], [1, 4]]}
+    expected_strategies = {'Default/network-Net/FloorDiv-op0': [[2, 4], [2, 4]],
+                           'Default/network-Net/MatMul-op1': [[2, 1], [1, 4]]}
    assert strategies == expected_strategies


@ -105,8 +105,8 @@ def test_auto_parallel_arithmetic_broadcast_both():
    b = Tensor(np.ones([1, 64]), dtype=ms.float32)
    compile_net(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
-    expected_strategies = {'Default/network-Net/FloorDiv-op1': [[8, 1], [1, 1]],
-                           'Default/network-Net/MatMul-op0': [[8, 1], [1, 1]]}
+    expected_strategies = {'Default/network-Net/FloorDiv-op0': [[8, 1], [1, 1]],
+                           'Default/network-Net/MatMul-op1': [[8, 1], [1, 1]]}
    assert strategies == expected_strategies


@ -132,8 +132,8 @@ def test_auto_parallel_arithmetic_broadcast_right():
    b = Tensor(np.ones([32]), dtype=ms.float32)
    compile_net(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
-    expected_strategies = {'Default/network-Net/FloorDiv-op1': [[4, 2], [2]],
-                           'Default/network-Net/MatMul-op0': [[4, 1], [1, 2]]}
+    expected_strategies = {'Default/network-Net/FloorDiv-op0': [[4, 2], [2]],
+                           'Default/network-Net/MatMul-op1': [[4, 1], [1, 2]]}
    assert strategies == expected_strategies


@ -159,6 +159,6 @@ def test_auto_parallel_arithmetic_broadcast_left():
    b = Tensor(np.ones([128, 64, 32]), dtype=ms.float32)
    compile_net(net, x, y, b, phase="train")
    strategies = _executor._get_shard_strategy(net)
-    expected_strategies = {'Default/network-Net/FloorDiv-op1': [[4, 2], [1, 4, 2]],
-                           'Default/network-Net/MatMul-op0': [[4, 1], [1, 2]]}
+    expected_strategies = {'Default/network-Net/FloorDiv-op0': [[4, 2], [1, 4, 2]],
+                           'Default/network-Net/MatMul-op1': [[4, 1], [1, 2]]}
    assert strategies == expected_strategies
--- a/tests/ut/python/parallel/test_auto_parallel_cast.py
+++ b/tests/ut/python/parallel/test_auto_parallel_cast.py
@ -84,9 +84,9 @@ def test_double_star_graph():
    net.set_train()
    _executor.compile(net, x, y, z, w, phase='train')
    strategies = _executor._get_shard_strategy(net)
-    expected_strategies = {'Default/network-Net/Cast-op0': [[8, 1]],
-                           'Default/network-Net/Cast-op1': [[1, 8]],
-                           'Default/network-Net/MatMul-op3': [[8, 1], [1, 1]],
-                           'Default/network-Net/MatMul-op2': [[1, 1], [1, 8]],
-                           'Default/network-Net/MatMul-op4': [[1, 8], [8, 1]]}
+    expected_strategies = {'Default/network-Net/Cast-op1': [[8, 1]],
+                           'Default/network-Net/Cast-op3': [[1, 8]],
+                           'Default/network-Net/MatMul-op2': [[8, 1], [1, 1]],
+                           'Default/network-Net/MatMul-op4': [[1, 1], [1, 8]],
+                           'Default/network-Net/MatMul-op0': [[1, 8], [8, 1]]}
    assert strategies == expected_strategies
--- a/tests/ut/python/parallel/test_auto_parallel_for_loop.py
+++ b/tests/ut/python/parallel/test_auto_parallel_for_loop.py
@ -0,0 +1,129 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import numpy as np
+
+import mindspore as ms
+from mindspore import context, Tensor, Parameter
+from mindspore.nn import Cell
+import mindspore.nn as nn
+from mindspore.ops import operations as P, functional as F
+from mindspore.common.initializer import initializer
+import mindspore.common.dtype as mstype
+from mindspore.common.api import _executor
+from tests.dataset_mock import MindData
+
+
+class Dataset(MindData):
+    """Mock dataset that yields the same (predict, label) pair `length` times."""
+
+    def __init__(self, predict, label, length=3):
+        super().__init__(size=length)
+        self.length = length
+        self.index = 0
+        self.predict = predict
+        self.label = label
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.index < self.length:
+            self.index += 1
+            return self.predict, self.label
+        raise StopIteration
+
+    def reset(self):
+        """Restart the iteration from the first item."""
+        self.index = 0
+
+
+class LayerNorm(nn.Cell):
+    """Layer normalization assembled from the ReduceMean, Sub, TensorAdd, Mul and RealDiv primitives."""
+    def __init__(self, normalized_shape, eps=1e-5):
+        super(LayerNorm, self).__init__()
+        # Learnable scale (initialised to ones) and shift (initialised to zeros).
+        self.gamma = Parameter(initializer('ones', normalized_shape), name="gamma")
+        self.beta = Parameter(initializer('zeros', normalized_shape), name="beta")
+        self.mean = P.ReduceMean(keep_dims=True)
+        self.eps = eps
+        self.sub = P.Sub()
+        self.add = P.TensorAdd()
+        self.mul = P.Mul()
+        self.div = P.RealDiv()
+
+    def construct(self, x):
+        # Mean over the last axis.
+        mean = self.mean(x, -1)
+        # NOTE(review): unlike the call above, this ReduceMean call passes no axis — confirm that
+        # reducing with the operator's default axis is intended for the variance.
+        variance = self.mean(F.square(self.sub(x, mean)))
+        # (x - mean) / sqrt(variance + eps)
+        output = self.div(self.sub(x, mean), F.sqrt(self.add(variance, self.eps)))
+        # output * gamma + beta
+        rescaled_output = self.add(self.mul(output, self.gamma), self.beta)
+        return rescaled_output
+
+
+class SubNet(Cell):
+    """One loop body: LayerNorm, then MatMul with the cell's own weight, then ReLU."""
+    def __init__(self, index):
+        super().__init__()
+        self.matmul = P.MatMul()
+        self.relu = P.ReLU()
+        # 'index' is appended to the parameter name so each SubNet instance gets a distinct name.
+        self.weight = Parameter(Tensor(np.ones([128, 128]), dtype=ms.float32), "matmul_w"+str(index))
+        self.layernorm1 = LayerNorm((128,)).to_float(mstype.float32)
+
+    def construct(self, x):
+        x = self.layernorm1(x)
+        out = self.matmul(x, self.weight)
+        out = self.relu(out)
+        return out
+
+
+class Net(Cell):
+    """Applies `num_layers` SubNet cells from a CellList in a for-loop, then Mul and Neg."""
+    def __init__(self, mul_weight, num_layers, strategy1=None, strategy2=None):
+        super().__init__()
+        # strategy1 / strategy2 are passed to shard() for Mul and Neg respectively.
+        self.mul = P.Mul().shard(strategy1)
+        self.neg = P.Neg().shard(strategy2)
+        self.mul_weight = Parameter(mul_weight, "w1")
+        self.num_layers = num_layers
+        self.layers = nn.CellList()
+        for i in range(num_layers):
+            self.layers.append(SubNet(i))
+
+    def construct(self, x):
+        # The for-loop over the CellList is the structure this test targets.
+        for i in range(self.num_layers):
+            x = self.layers[i](x)
+        out = self.mul(x, self.mul_weight)
+        out = self.neg(out)
+        return out
+
+
+class Full(Cell):
+    """Wraps Net and applies a final ReLU to its output."""
+    def __init__(self, mul_weight, num_layers, strategy1=None, strategy2=None):
+        super().__init__()
+        self.network = Net(mul_weight, num_layers, strategy1, strategy2)
+        self.relu = P.ReLU()
+
+    def construct(self, x):
+        out = self.network(x)
+        out = self.relu(out)
+        return out
+
+
+_x = Tensor(np.ones([512, 128]), dtype=ms.float32)
+_b = Tensor(np.ones([32]), dtype=ms.int32)
+_w1 = Tensor(np.ones([512, 128]), dtype=ms.float32)
+
+
+def test_auto_parallel():
+    """Compile a 3-layer CellList network in auto_parallel mode and check the operator count reported by the executor."""
+    # NOTE(review): save_graphs=True is enabled here — confirm it is required by the test and not
+    # a leftover from debugging.
+    context.set_context(save_graphs=True)
+    context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16, global_rank=0)
+    net = Full(_w1, 3)
+    net.set_auto_parallel()
+    net.set_train()
+    _executor.compile(net, _x, phase='train')
+    num_ops = _executor._get_num_parallel_ops(net)
+    expected_num = 16
+    assert num_ops == expected_num
--- a/tests/ut/python/parallel/test_auto_parallel_for_loop_multi_subgraph.py
+++ b/tests/ut/python/parallel/test_auto_parallel_for_loop_multi_subgraph.py
@ -0,0 +1,136 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import numpy as np
+
+import mindspore as ms
+import mindspore.nn as nn
+from mindspore import Tensor, Parameter, ParameterTuple
+from mindspore import context
+from mindspore.common.api import _executor
+from mindspore.nn.optim import Adam, FTRL
+from mindspore.ops import composite as C
+from mindspore.ops import functional as F
+from mindspore.ops import operations as P
+from mindspore.parallel._cost_model_context import _set_multi_subgraphs
+from mindspore.parallel._utils import _reset_op_id as reset_op_id
+
+
+class SubNet(nn.Cell):  # one loop-body layer: BatchMatMul with its own weight, then ReLU
+    def __init__(self, index):  # index is appended to the weight's name ("matmul_w" + str(index))
+        super().__init__()
+        self.matmul = P.BatchMatMul()
+        self.relu = P.ReLU()
+        self.weight = Parameter(Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32), "matmul_w"+str(index))
+
+    def construct(self, x):
+        out = self.matmul(x, self.weight)
+        out = self.relu(out)
+        return out
+
+
+class Net(nn.Cell):  # three SubNet layers, then two Mul ops (by wd, then wt) and a ReLU
+    def __init__(self):
+        super(Net, self).__init__()
+        self.mul = P.Mul()  # this single Mul primitive is called twice in construct
+        self.relu = P.ReLU()
+        self.wd = Parameter(Tensor(np.ones([8, 8, 8, 8]).astype(np.float32)), name="wide")
+        self.wt = Parameter(Tensor(np.ones([8, 8, 8, 8]).astype(np.float32)), name="l")
+        self.layers = nn.CellList()
+        for i in range(3):  # layer count is hard-coded; construct iterates over the same range(3)
+            self.layers.append(SubNet(i))
+
+    def construct(self, x):
+        for i in range(3):  # each layer consumes the previous layer's output
+            x = self.layers[i](x)
+        out = self.mul(x, self.wd)
+        out = self.mul(out, self.wt)
+        out = self.relu(out)
+        return out
+
+
+class NetWithLoss(nn.Cell):  # runs the wrapped network once and returns two reductions of its output
+    def __init__(self, network):
+        super(NetWithLoss, self).__init__()
+        self.sum = P.ReduceSum()
+        self.mean = P.ReduceMean()
+        self.net = network
+
+    def construct(self, x):
+        predict = self.net(x)
+        loss1 = self.sum(predict, -1)  # both reductions receive -1 as their second argument
+        loss2 = self.mean(predict, -1)
+        return loss1, loss2  # tuple order: index 0 is the ReduceSum result, index 1 the ReduceMean result
+
+
+class IthOutputCell(nn.Cell):  # calls network and returns only element output_index of its result
+    def __init__(self, network, output_index):
+        super(IthOutputCell, self).__init__()
+        self.network = network
+        self.output_index = output_index
+
+    def construct(self, x):
+        predict = self.network(x)[self.output_index]  # the result of network(x) is indexed, so it must be a sequence
+        return predict
+
+
+class TrainStepWarp(nn.Cell):  # train step with two gradient/optimizer branches over one shared network
+    def __init__(self, network, sens=1000.0):  # sens is the fill value of the sens tensors and Adam's loss_scale
+        super(TrainStepWarp, self).__init__()
+        self.network = network
+        self.network.set_train()
+        self.trainable_params = network.trainable_params()
+        weights_w = []
+        weights_d = []
+        for params in self.trainable_params:  # both lists receive every trainable parameter
+            weights_w.append(params)
+            weights_d.append(params)
+        self.weights_w = ParameterTuple(weights_w)
+        self.weights_d = ParameterTuple(weights_d)
+        self.optimizer_w = FTRL(learning_rate=1e-2, params=self.weights_w, l1=1e-8,
+                                l2=1e-8, initial_accum=1.0)
+        self.optimizer_d = Adam(self.weights_d, learning_rate=3.5e-4, eps=1e-8,
+                                loss_scale=sens)
+        self.hyper_map = C.HyperMap()  # not used by construct below
+        self.grad_w = C.GradOperation(get_by_list=True, sens_param=True)
+        self.grad_d = C.GradOperation(get_by_list=True, sens_param=True)
+        self.sens = sens
+        self.loss_net_w = IthOutputCell(network, output_index=0)  # differentiates output 0 of network
+        self.loss_net_d = IthOutputCell(network, output_index=1)  # differentiates output 1 of network
+
+    def construct(self, x):  # returns (loss_w, loss_d), each passed through F.depend with its optimizer call
+        weights_w = self.weights_w
+        weights_d = self.weights_d
+        loss_w, loss_d = self.network(x)  # network is expected to return exactly two values
+        sens_w = P.Fill()(P.DType()(loss_w), P.Shape()(loss_w), self.sens)  # built from loss_w's dtype and shape
+        sens_d = P.Fill()(P.DType()(loss_d), P.Shape()(loss_d), self.sens)  # built from loss_d's dtype and shape
+        grads_w = self.grad_w(self.loss_net_w, weights_w)(x, sens_w)
+        grads_d = self.grad_d(self.loss_net_d, weights_d)(x, sens_d)
+        return F.depend(loss_w, self.optimizer_w(grads_w)), F.depend(loss_d, self.optimizer_d(grads_d))
+
+
+def test_double_subgraphs():  # compile the two-branch train step under auto_parallel; check the op count
+    context.set_context(save_graphs=True)
+    context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=8, global_rank=0)
+    net = TrainStepWarp(NetWithLoss(Net()))
+    _set_multi_subgraphs()  # NOTE(review): never undone in this test; confirm it cannot leak into later tests
+    net.set_auto_parallel()
+
+    x = Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32)
+    reset_op_id()  # called right before compiling
+    net.set_train()
+    _executor.compile(net, x, phase='train')
+    num_ops = _executor._get_num_parallel_ops(net)
+    expected_num = 7  # NOTE(review): presumably tied to the ops built by the network above; re-check on change
+    assert expected_num == num_ops
--- a/tests/ut/python/parallel/test_auto_parallel_for_loop_simplify.py
+++ b/tests/ut/python/parallel/test_auto_parallel_for_loop_simplify.py
@ -0,0 +1,101 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import numpy as np
+
+import mindspore as ms
+from mindspore import context, Tensor, Parameter
+from mindspore.nn import Cell, Momentum
+from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
+import mindspore.nn as nn
+from mindspore.ops import operations as P
+from mindspore.train import Model
+from tests.dataset_mock import MindData
+
+
+class Dataset(MindData):  # iterator that yields the same (predict, label) pair `length` times
+    def __init__(self, predict, label, length=3):
+        super(Dataset, self).__init__(size=length)  # length is also passed to MindData as size
+        self.predict = predict
+        self.label = label
+        self.index = 0  # number of items handed out since construction or the last reset()
+        self.length = length
+
+    def __iter__(self):
+        return self  # the dataset is its own iterator
+
+    def __next__(self):
+        if self.index >= self.length:  # stays exhausted until reset() is called
+            raise StopIteration
+        self.index += 1
+        return self.predict, self.label
+
+    def reset(self):  # rewind so the dataset can be iterated again
+        self.index = 0
+
+
+class SubNet(Cell):  # one loop-body layer: MatMul with its own 128x128 weight, then ReLU
+    def __init__(self, index):  # index is appended to the weight's name ("matmul_w" + str(index))
+        super().__init__()
+        self.matmul = P.MatMul()
+        self.relu = P.ReLU()
+        self.weight = Parameter(Tensor(np.ones([128, 128]), dtype=ms.float32), "matmul_w"+str(index))
+
+    def construct(self, x):
+        out = self.matmul(x, self.weight)
+        out = self.relu(out)
+        return out
+
+
+class Net(Cell):  # num_layers SubNet cells applied in sequence, followed by Mul and Neg
+    def __init__(self, mul_weight, num_layers, strategy1=None, strategy2=None):
+        super().__init__()
+        self.mul = P.Mul().shard(strategy1)  # strategies default to None and are passed to shard() unchanged
+        self.neg = P.Neg().shard(strategy2)
+        self.mul_weight = Parameter(mul_weight, "w1")
+        self.num_layers = num_layers
+        self.layers = nn.CellList()
+        for i in range(num_layers):
+            self.layers.append(SubNet(i))  # i becomes part of each SubNet's weight name
+
+    def construct(self, x):
+        for i in range(self.num_layers):  # each layer consumes the previous layer's output
+            x = self.layers[i](x)
+        out = self.mul(x, self.mul_weight)
+        out = self.neg(out)
+        return out
+
+
+_x = Tensor(np.ones([32, 128]), dtype=ms.float32)  # "predict" item handed to Dataset in compile_net
+_b = Tensor(np.ones([32]), dtype=ms.int32)  # "label" item handed to Dataset in compile_net
+_w1 = Tensor(np.ones([512, 128]), dtype=ms.float32)  # NOTE(review): 512 rows vs 32 in _x; confirm Mul shapes
+
+
+def compile_net(net):  # train net for 2 epochs on the mock Dataset, then reset the auto-parallel context
+    context.set_context(save_graphs=True)
+    learning_rate = 0.1
+    momentum = 0.9
+    epoch_size = 2
+    dataset = Dataset(_x, _b)  # length keeps its default of 3
+    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
+    opt = Momentum(net.trainable_params(), learning_rate, momentum)
+    model = Model(net, loss, optimizer=opt)
+    model.train(epoch_size, dataset, dataset_sink_mode=False)
+    context.reset_auto_parallel_context()  # NOTE(review): not reached if train() raises
+
+
+def test_auto_parallel():  # no assertion: passes as long as compile_net(net) does not raise
+    context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16, global_rank=0)
+    net = Net(_w1, 3)  # num_layers=3; strategy1/strategy2 keep their None defaults
+    compile_net(net)
--- a/tests/ut/python/parallel/test_auto_parallel_transpose.py
+++ b/tests/ut/python/parallel/test_auto_parallel_transpose.py
@ -79,8 +79,8 @@ def test_two_matmul_transpose():
    net.set_train()
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
-    expected_strategies = {'Default/network-Net/Transpose-op3': [[1, 16]],
-                           'Default/network-Net/Transpose-op2': [[16, 1]],
-                           'Default/network-Net/MatMul-op0': [[16, 1], [1, 1]],
-                           'Default/network-Net/MatMul-op1': [[16, 1], [1, 1]]}
+    expected_strategies = {'Default/network-Net/Transpose-op0': [[1, 16]],
+                           'Default/network-Net/Transpose-op1': [[16, 1]],
+                           'Default/network-Net/MatMul-op2': [[16, 1], [1, 1]],
+                           'Default/network-Net/MatMul-op3': [[16, 1], [1, 1]]}
    assert strategies == expected_strategies
--- a/tests/ut/python/parallel/test_auto_parallel_two_matmul.py
+++ b/tests/ut/python/parallel/test_auto_parallel_two_matmul.py
@ -22,6 +22,7 @@ from mindspore.common.api import _executor
 from mindspore.ops import composite as C
 from mindspore.ops import operations as P
 from mindspore.parallel import _cost_model_context as cost_model_context
+from mindspore.parallel._cost_model_context import _set_algo_single_loop, _get_algo_single_loop
 from mindspore.parallel import set_algo_parameters, get_algo_parameters, reset_algo_parameters
 from mindspore.parallel._utils import _reset_op_id as reset_op_id
 from tests.ut.python.ops.test_math_ops import VirtualLoss
@ -120,6 +121,14 @@ def test_two_matmul():
    algo_epsilon = get_algo_parameters("algo_approxi_epsilon")
    assert algo_epsilon == 0.001

+    expected_single_loop = True  # value expected before _set_algo_single_loop is called below
+    single_loop = _get_algo_single_loop()
+    assert expected_single_loop == single_loop
+    expected_single_loop = False
+    _set_algo_single_loop(expected_single_loop)  # flip the flag, then read it back
+    single_loop = _get_algo_single_loop()
+    assert expected_single_loop == single_loop
+
    reset_algo_parameters()
    para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable")
    assert not para_slice_align_enable