diff --git a/mindspore/ccsrc/frontend/parallel/step_parallel.cc b/mindspore/ccsrc/frontend/parallel/step_parallel.cc
index f182e8e405..ff1c666c45 100644
--- a/mindspore/ccsrc/frontend/parallel/step_parallel.cc
+++ b/mindspore/ccsrc/frontend/parallel/step_parallel.cc
@@ -1512,7 +1512,87 @@ Status ValidStageCheck(const std::vector<int32_t> &stages, int32_t strategy_stag
   }
 }
 
-void ExtractInformation(const std::vector<AnfNodePtr> &all_nodes) {
+// Find the nearest parallel care nodes that feed `node` and append their unique ids to `unique_ids`.
+// Returns true if at least one id was appended while walking `node` and, recursively, its inputs.
+// Raises through MS_EXCEPTION_IF_NULL when `node` or `unique_ids` is null.
+bool FindPreNodes(const AnfNodePtr &node, std::vector<std::string> *unique_ids) {
+  MS_EXCEPTION_IF_NULL(node);
+  MS_EXCEPTION_IF_NULL(unique_ids);
+  // a Parameter (like any other non-CNode) ends the search on this path; parameters are handled outside.
+  CNodePtr cnode = node->cast<CNodePtr>();
+  if ((cnode == nullptr) || !IsValueNode<Primitive>(cnode->input(0))) {
+    return false;
+  }
+  ValueNodePtr prim_anf_node = cnode->input(0)->cast<ValueNodePtr>();
+  PrimitivePtr prim = prim_anf_node->value()->cast<PrimitivePtr>();
+  // MakeTuple/MakeList are never recorded themselves, the search continues into their inputs instead.
+  if (IsParallelCareNode(cnode) && prim->name() != MAKE_TUPLE && prim->name() != MAKE_LIST) {
+    unique_ids->push_back(cnode->UniqueId());
+    return true;
+  }
+  const bool is_depend = (prim->name() == DEPEND);
+  bool find = false;
+  for (size_t index = 0; index < cnode->inputs().size(); ++index) {
+    // for Depend only input 1 is followed (presumably the real data input -- TODO confirm).
+    if (is_depend && index != 1) {
+      continue;
+    }
+    // no early exit on a hit: the remaining inputs may lead to further parallel care nodes.
+    if (FindPreNodes(cnode->inputs()[index], unique_ids)) {
+      find = true;
+    }
+  }
+  return find;
+}
+
+// Collect into `unique_ids` the ids of the parallel care nodes that precede each Return node of `all_nodes`.
+// A warning is logged for every Return node for which FindPreNodes returns false.
+void FindLastNodesUniqueId(const std::vector<AnfNodePtr> &all_nodes, vector<std::string> *unique_ids) {
+  MS_EXCEPTION_IF_NULL(unique_ids);
+  for (const auto &anf_node : all_nodes) {
+    const auto cnode = anf_node->cast<CNodePtr>();
+    if (cnode == nullptr || !IsValueNode<Primitive>(cnode->input(0))) {
+      continue;
+    }
+    // input 0 is known to hold a Primitive at this point, so the RETURN test can read its name directly.
+    const PrimitivePtr primitive = GetValueNode<PrimitivePtr>(cnode->input(0)->cast<ValueNodePtr>());
+    if (primitive->name() == RETURN && !FindPreNodes(cnode, unique_ids)) {
+      MS_LOG(WARNING) << "cannot find the last parallel care node in eval graph";
+    }
+  }
+}
+
+// Build the batch parallel strategy of `operator_` and expose it on `prim` through the GEN_STRATEGY attribute.
+// Returns the strategy created by NewStrategy(0, ...) from the operator's batch strategies.
+StrategyPtr GenerateBatchParallelStrategy(const OperatorInfoPtr operator_, const PrimitivePtr prim) {
+  MS_EXCEPTION_IF_NULL(operator_);
+  MS_EXCEPTION_IF_NULL(prim);
+  const std::shared_ptr<Strategys> batch_strategies = operator_->GenerateBatchStrategies();
+  MS_EXCEPTION_IF_NULL(batch_strategies);
+  StrategyPtr batch_strategy = NewStrategy(0, *batch_strategies);
+  std::vector<ValuePtr> dims_values;
+  for (const auto &dims : *batch_strategies) {
+    dims_values.push_back(MakeValue(dims));
+  }
+  // attach the generated strategy to the primitive so that it can be displayed
+  auto prim_attrs = prim->attrs();
+  prim_attrs[GEN_STRATEGY] = std::make_shared<ValueTuple>(dims_values);
+  (void)prim->SetAttrs(prim_attrs);
+  MS_LOG(INFO) << "prim " << prim->name() << " batch parallel strategy is " << prim_attrs[GEN_STRATEGY]->ToString();
+  return batch_strategy;
+}
+
+// Reset every dimension of every input strategy held by `strategyPtr` to 1 (written back via ResetInputs).
+void SetLastNodeStrategy(const StrategyPtr strategyPtr) {
+  MS_EXCEPTION_IF_NULL(strategyPtr);
+  auto strategys = strategyPtr->GetInputDim();
+  for (auto &dimensions : strategys) {
+    std::fill(dimensions.begin(), dimensions.end(), 1);
+  }
+  strategyPtr->ResetInputs(strategys);
+}
+
+void ExtractInformation(const std::vector<AnfNodePtr> &all_nodes, bool is_training) {
   // load strategy map from checkpoint
   StrategyMap stra_map;
   if (StrategyCheckpoint::GetInstance().LoadCheckPointOn()) {
@@ -1520,7 +1600,11 @@ void ExtractInformation(const std::vector<AnfNodePtr> &all_nodes) {
       MS_LOG(EXCEPTION) << "Load strategy checkpoint failed";
     }
   }
-
+  vector<std::string> last_forward_node_ids;
+  if (!is_training) {
+    FindLastNodesUniqueId(all_nodes, &last_forward_node_ids);
+    MS_LOG(INFO) << "there are " << last_forward_node_ids.size() << " output nodes in eval/predict";
+  }
   // Get global rank after the checkpoint?
   int32_t global_rank = ParallelContext::GetInstance()->global_rank();
   std::vector<int32_t> stages = ParallelContext::GetInstance()->stage();
@@ -1572,30 +1656,22 @@ void ExtractInformation(const std::vector<AnfNodePtr> &all_nodes) {
       }
       bool load_strategy_from_ckpt =
         StrategyCheckpoint::GetInstance().LoadCheckPointOn() && stra_map.find(strategy_key_name) != stra_map.end();
-      if (!StrategyFound(attrs) && !load_strategy_from_ckpt) {
+      bool is_last_nodes = std::find(last_forward_node_ids.begin(), last_forward_node_ids.end(), cnode->UniqueId()) !=
+                           last_forward_node_ids.end();
+      bool full_batch = ParallelContext::GetInstance()->full_batch();
+      if ((is_last_nodes && !full_batch) || (!StrategyFound(attrs) && !load_strategy_from_ckpt)) {
         MS_LOG(INFO) << "ExtractInformation: the strategy of node " << node->ToString() << " prim " << prim->name()
                      << " is empty, using batch parallel";
-        std::shared_ptr<Strategys> strategy_v_ptr = operator_->GenerateBatchStrategies();
-        if (strategy_v_ptr == nullptr) {
-          MS_LOG(EXCEPTION) << "Failure:Generate batch parallel strategy failed";
-        }
-        std::vector<ValuePtr> elements;
-        for (size_t i = 0; i < strategy_v_ptr->size(); i++) {
-          elements.push_back(MakeValue((*strategy_v_ptr)[i]));
-        }
-        ValueTuplePtr strategy = std::make_shared<ValueTuple>(elements);
-        // display the strategy generated by batch parallel
-        attrs[GEN_STRATEGY] = strategy;
-        (void)prim->SetAttrs(attrs);
-        MS_LOG(INFO) << "node " << node->ToString() << " prim " << prim->name() << " batch parallel strategy is "
-                     << attrs[GEN_STRATEGY]->ToString();
-        strategyPtr = NewStrategy(0, *strategy_v_ptr);
+        strategyPtr = GenerateBatchParallelStrategy(operator_, prim);
       } else if (load_strategy_from_ckpt) {
         strategyPtr = stra_map[strategy_key_name];
       } else {
         strategyPtr = ExtractStrategy(attrs);
       }
       if (strategyPtr != nullptr) {
+        if (is_last_nodes && full_batch) {
+          SetLastNodeStrategy(strategyPtr);
+        }
         (*operator_).set_stage_id(strategyPtr->GetInputStage());
         MS_LOG(INFO) << "Extract stage id for op " << prim->name() << " is " << (*operator_).stage_id();
         if (ValidStageCheck(stages, (*operator_).stage_id()) == FAILED) {
@@ -2854,7 +2930,7 @@ bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer)
     }
 
     // extract shape and strategy, set operator_info
-    ExtractInformation(all_nodes);
+    ExtractInformation(all_nodes, root->has_flag(TRAINING));
     ReshapeInit(all_nodes);
   }
 
diff --git a/mindspore/ccsrc/frontend/parallel/step_parallel.h b/mindspore/ccsrc/frontend/parallel/step_parallel.h
index 4ff9eef96e..47fb8e78c2 100644
--- a/mindspore/ccsrc/frontend/parallel/step_parallel.h
+++ b/mindspore/ccsrc/frontend/parallel/step_parallel.h
@@ -118,7 +118,7 @@ void CoverSliceShape(const FuncGraphPtr &root);
 void SetVirtualDatasetStrategy(const CNodePtr &node);
 
 // Creat parallel operator for primitive node(has strategy)
-void ExtractInformation(const std::vector<AnfNodePtr> &all_nodes);
+void ExtractInformation(const std::vector<AnfNodePtr> &all_nodes, bool is_training = true);
 
 TensorLayout GetInputLayoutFromCNode(const std::pair<AnfNodePtr, int> &node_pair);
 
diff --git a/tests/ut/python/parallel/test_add_relu_redistribution.py b/tests/ut/python/parallel/test_add_relu_redistribution.py
index ac88592399..1efb3acde8 100644
--- a/tests/ut/python/parallel/test_add_relu_redistribution.py
+++ b/tests/ut/python/parallel/test_add_relu_redistribution.py
@@ -59,6 +59,7 @@ class Grad(nn.Cell):
 
 def compile_net(net, x, y):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
 
 
diff --git a/tests/ut/python/parallel/test_arithmetic.py b/tests/ut/python/parallel/test_arithmetic.py
index 36e52a993e..2d475945ba 100644
--- a/tests/ut/python/parallel/test_arithmetic.py
+++ b/tests/ut/python/parallel/test_arithmetic.py
@@ -48,6 +48,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
@@ -649,6 +650,7 @@ def test_assign_sub():
 
     def compile_sub_net(net, x):
         net.set_auto_parallel()
+        net.set_train()
         _executor.compile(net, x)
 
     context.set_auto_parallel_context(device_num=64, global_rank=15)
@@ -696,6 +698,7 @@ def test_assign_add():
 
     def compile_sub_net(net, x):
         net.set_auto_parallel()
+        net.set_train()
         _executor.compile(net, x)
 
     context.set_auto_parallel_context(device_num=64, global_rank=15)
@@ -743,6 +746,7 @@ def test_assign():
 
     def compile_sub_net(net, x):
         net.set_auto_parallel()
+        net.set_train()
         _executor.compile(net, x)
 
     context.set_auto_parallel_context(device_num=64, global_rank=15)
diff --git a/tests/ut/python/parallel/test_auto_parallel_BN_PReLU.py b/tests/ut/python/parallel/test_auto_parallel_BN_PReLU.py
index 4f9cd92c3c..07b843b614 100644
--- a/tests/ut/python/parallel/test_auto_parallel_BN_PReLU.py
+++ b/tests/ut/python/parallel/test_auto_parallel_BN_PReLU.py
@@ -73,4 +73,5 @@ def test_auto_parallel_bn_with_prelu():
 
     net = GradWrap(NetWithLoss(Net()))
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
diff --git a/tests/ut/python/parallel/test_auto_parallel_activation.py b/tests/ut/python/parallel/test_auto_parallel_activation.py
index beaa047f2f..2be2ae3554 100644
--- a/tests/ut/python/parallel/test_auto_parallel_activation.py
+++ b/tests/ut/python/parallel/test_auto_parallel_activation.py
@@ -43,6 +43,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_auto_parallel_arithmetic.py b/tests/ut/python/parallel/test_auto_parallel_arithmetic.py
index fff6447c22..51413f98e5 100644
--- a/tests/ut/python/parallel/test_auto_parallel_arithmetic.py
+++ b/tests/ut/python/parallel/test_auto_parallel_arithmetic.py
@@ -52,6 +52,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y, b, phase):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b, phase=phase)
 
 
diff --git a/tests/ut/python/parallel/test_auto_parallel_assign_sub_with_ref_key.py b/tests/ut/python/parallel/test_auto_parallel_assign_sub_with_ref_key.py
index 530a122cfc..3bd389db7d 100644
--- a/tests/ut/python/parallel/test_auto_parallel_assign_sub_with_ref_key.py
+++ b/tests/ut/python/parallel/test_auto_parallel_assign_sub_with_ref_key.py
@@ -61,6 +61,7 @@ def test_auto_parallel_assign_sub_with_ref_key():
     net.set_auto_parallel()
     reset_op_id()
 
+    net.set_train()
     _executor.compile(net, x, phase="train")
     strategies = _executor._get_shard_strategy(net)
     for (k, v) in strategies.items():
diff --git a/tests/ut/python/parallel/test_auto_parallel_cast.py b/tests/ut/python/parallel/test_auto_parallel_cast.py
index 0868f0d871..4dee5c42de 100644
--- a/tests/ut/python/parallel/test_auto_parallel_cast.py
+++ b/tests/ut/python/parallel/test_auto_parallel_cast.py
@@ -81,6 +81,7 @@ def test_double_star_graph():
     net.set_auto_parallel()
     reset_op_id()
 
+    net.set_train()
     _executor.compile(net, x, y, z, w, phase='train')
     strategies = _executor._get_shard_strategy(net)
     expected_strategies = {'Default/network-Net/Cast-op0': [[8, 1]],
diff --git a/tests/ut/python/parallel/test_auto_parallel_common_parameter.py b/tests/ut/python/parallel/test_auto_parallel_common_parameter.py
index 9ab8b27406..33059a7a48 100644
--- a/tests/ut/python/parallel/test_auto_parallel_common_parameter.py
+++ b/tests/ut/python/parallel/test_auto_parallel_common_parameter.py
@@ -72,4 +72,5 @@ def test_common_parameter():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, z)
diff --git a/tests/ut/python/parallel/test_auto_parallel_double_sources.py b/tests/ut/python/parallel/test_auto_parallel_double_sources.py
index 6ad7858505..9e361e7106 100644
--- a/tests/ut/python/parallel/test_auto_parallel_double_sources.py
+++ b/tests/ut/python/parallel/test_auto_parallel_double_sources.py
@@ -79,6 +79,7 @@ def test_double_source_graph():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, z, w, a)
 
 
@@ -114,4 +115,5 @@ def test_double_source_complex_graph():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, z, w, a)
diff --git a/tests/ut/python/parallel/test_auto_parallel_double_star.py b/tests/ut/python/parallel/test_auto_parallel_double_star.py
index 5a43159993..b6b43a6d26 100644
--- a/tests/ut/python/parallel/test_auto_parallel_double_star.py
+++ b/tests/ut/python/parallel/test_auto_parallel_double_star.py
@@ -83,4 +83,5 @@ def test_double_star_graph():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, z, w, a, b, c)
diff --git a/tests/ut/python/parallel/test_auto_parallel_double_subgraphs.py b/tests/ut/python/parallel/test_auto_parallel_double_subgraphs.py
index 80775e48ac..0431604411 100644
--- a/tests/ut/python/parallel/test_auto_parallel_double_subgraphs.py
+++ b/tests/ut/python/parallel/test_auto_parallel_double_subgraphs.py
@@ -113,6 +113,7 @@ def test_double_subgraphs():
 
     x = Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32)
     reset_op_id()
+    net.set_train()
     _executor.compile(net, x, phase='train')
     strategies = _executor._get_shard_strategy(net)
     for (k, v) in strategies.items():
diff --git a/tests/ut/python/parallel/test_auto_parallel_fc_nobias.py b/tests/ut/python/parallel/test_auto_parallel_fc_nobias.py
index 05e57801c0..b64f8fec5d 100644
--- a/tests/ut/python/parallel/test_auto_parallel_fc_nobias.py
+++ b/tests/ut/python/parallel/test_auto_parallel_fc_nobias.py
@@ -70,4 +70,5 @@ def test_two_matmul():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
diff --git a/tests/ut/python/parallel/test_auto_parallel_four_matmul.py b/tests/ut/python/parallel/test_auto_parallel_four_matmul.py
index c005fcffde..0590ff062b 100644
--- a/tests/ut/python/parallel/test_auto_parallel_four_matmul.py
+++ b/tests/ut/python/parallel/test_auto_parallel_four_matmul.py
@@ -49,6 +49,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y, z, w, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, z, w, b)
 
     # model_parallel test
diff --git a/tests/ut/python/parallel/test_auto_parallel_l2normalize.py b/tests/ut/python/parallel/test_auto_parallel_l2normalize.py
index 1a1c1502f3..2de76ab7d9 100644
--- a/tests/ut/python/parallel/test_auto_parallel_l2normalize.py
+++ b/tests/ut/python/parallel/test_auto_parallel_l2normalize.py
@@ -73,4 +73,5 @@ def test_auto_parallel_l2normalize():
     x = Tensor(np.ones([128, 64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([128, 64, 64]), dtype=ms.float32)
     b = Tensor(np.ones([128, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y, b, phase='train')
diff --git a/tests/ut/python/parallel/test_auto_parallel_matmul_drop.py b/tests/ut/python/parallel/test_auto_parallel_matmul_drop.py
index 738614ab5e..35012641f4 100644
--- a/tests/ut/python/parallel/test_auto_parallel_matmul_drop.py
+++ b/tests/ut/python/parallel/test_auto_parallel_matmul_drop.py
@@ -70,4 +70,5 @@ def test_two_matmul_dropout():
     x = Tensor(np.ones([128, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
     b = Tensor(np.ones([64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y, b)
diff --git a/tests/ut/python/parallel/test_auto_parallel_matmul_prelu.py b/tests/ut/python/parallel/test_auto_parallel_matmul_prelu.py
index f8c64a4baa..b11836a435 100644
--- a/tests/ut/python/parallel/test_auto_parallel_matmul_prelu.py
+++ b/tests/ut/python/parallel/test_auto_parallel_matmul_prelu.py
@@ -74,6 +74,7 @@ def test_matmul_prelu():
     net.set_auto_parallel()
     reset_op_id()
 
+    net.set_train()
     _executor.compile(net, x, y, b, phase='train')
     strategies = _executor._get_shard_strategy(net)
     for (k, v) in strategies.items():
diff --git a/tests/ut/python/parallel/test_auto_parallel_multi_graph.py b/tests/ut/python/parallel/test_auto_parallel_multi_graph.py
index f510fdedeb..ab71e6cb6f 100644
--- a/tests/ut/python/parallel/test_auto_parallel_multi_graph.py
+++ b/tests/ut/python/parallel/test_auto_parallel_multi_graph.py
@@ -58,6 +58,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, inputs_, label_)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_auto_parallel_onehot.py b/tests/ut/python/parallel/test_auto_parallel_onehot.py
index 03fb233ae4..59c6ed7271 100644
--- a/tests/ut/python/parallel/test_auto_parallel_onehot.py
+++ b/tests/ut/python/parallel/test_auto_parallel_onehot.py
@@ -99,6 +99,7 @@ def test_auto_parallel_arithmetic():
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
     b = Tensor(np.ones([64]), dtype=ms.int32)
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
diff --git a/tests/ut/python/parallel/test_auto_parallel_parameter_cast.py b/tests/ut/python/parallel/test_auto_parallel_parameter_cast.py
index ab84db70d0..0890dfd3c0 100644
--- a/tests/ut/python/parallel/test_auto_parallel_parameter_cast.py
+++ b/tests/ut/python/parallel/test_auto_parallel_parameter_cast.py
@@ -68,6 +68,7 @@ def test_common_parameter():
     net.set_auto_parallel()
     reset_op_id()
 
+    net.set_train()
     _executor.compile(net, x, y, phase='train')
     strategies = _executor._get_shard_strategy(net)
     for (k, v) in strategies.items():
diff --git a/tests/ut/python/parallel/test_auto_parallel_partial_strategy.py b/tests/ut/python/parallel/test_auto_parallel_partial_strategy.py
index 2606a7d302..3011f44b8b 100644
--- a/tests/ut/python/parallel/test_auto_parallel_partial_strategy.py
+++ b/tests/ut/python/parallel/test_auto_parallel_partial_strategy.py
@@ -77,4 +77,5 @@ def test_four_matmul_linear():
     net = GradWrap(NetWithLoss(Net(strategy1)))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, z, w, b)
diff --git a/tests/ut/python/parallel/test_auto_parallel_reduce_method.py b/tests/ut/python/parallel/test_auto_parallel_reduce_method.py
index 415ddf94d0..2161b17d6d 100644
--- a/tests/ut/python/parallel/test_auto_parallel_reduce_method.py
+++ b/tests/ut/python/parallel/test_auto_parallel_reduce_method.py
@@ -49,6 +49,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
diff --git a/tests/ut/python/parallel/test_auto_parallel_reshape.py b/tests/ut/python/parallel/test_auto_parallel_reshape.py
index 5a384d874a..479c727475 100644
--- a/tests/ut/python/parallel/test_auto_parallel_reshape.py
+++ b/tests/ut/python/parallel/test_auto_parallel_reshape.py
@@ -68,6 +68,7 @@ def test_reshape_matmul():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 def test_reshape_reshape():
@@ -90,6 +91,7 @@ def test_reshape_reshape():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 
@@ -115,6 +117,7 @@ def test_reshape_auto_1():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 
@@ -143,6 +146,7 @@ def test_reshape_auto_2():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 
@@ -168,6 +172,7 @@ def test_reshape_auto_3():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 
@@ -194,6 +199,7 @@ def test_reshape_auto_4():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 
@@ -244,6 +250,7 @@ def test_reshape_auto_5():
     net = GradWrap5(NetWithLoss5(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
 
 def test_reshape_auto_6():
@@ -291,6 +298,7 @@ def test_reshape_auto_6():
     net = GradWrap6(NetWithLoss6(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
 
 def test_reshape_auto_7():
@@ -313,4 +321,5 @@ def test_reshape_auto_7():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
diff --git a/tests/ut/python/parallel/test_auto_parallel_rhombus.py b/tests/ut/python/parallel/test_auto_parallel_rhombus.py
index fb7b6caf6e..2bfc0ee4f9 100644
--- a/tests/ut/python/parallel/test_auto_parallel_rhombus.py
+++ b/tests/ut/python/parallel/test_auto_parallel_rhombus.py
@@ -49,6 +49,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
diff --git a/tests/ut/python/parallel/test_auto_parallel_softmax_loss.py b/tests/ut/python/parallel/test_auto_parallel_softmax_loss.py
index 448e322c2a..8334fd893d 100644
--- a/tests/ut/python/parallel/test_auto_parallel_softmax_loss.py
+++ b/tests/ut/python/parallel/test_auto_parallel_softmax_loss.py
@@ -66,4 +66,5 @@ def test_softmax_cross_entropy_loss_auto_parallel():
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 32]), dtype=ms.float32)
     b = Tensor(np.ones([64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y, b)
diff --git a/tests/ut/python/parallel/test_auto_parallel_star_partial_strategy.py b/tests/ut/python/parallel/test_auto_parallel_star_partial_strategy.py
index af3d0ac431..28ec839831 100644
--- a/tests/ut/python/parallel/test_auto_parallel_star_partial_strategy.py
+++ b/tests/ut/python/parallel/test_auto_parallel_star_partial_strategy.py
@@ -88,6 +88,7 @@ def test_star_strategy_consistency1():
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
     reset_op_id()
+    net.set_train()
     _executor.compile(net, x, phase='train')
 
 
@@ -102,6 +103,7 @@ def test_star_strategy_consistency2():
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
     reset_op_id()
+    net.set_train()
     _executor.compile(net, x, phase='train')
 
 
@@ -116,6 +118,7 @@ def test_star_strategy_consistency3():
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
     reset_op_id()
+    net.set_train()
     _executor.compile(net, x, phase='train')
 
 
@@ -131,4 +134,5 @@ def test_star_strategy_consistency4():
     net.set_auto_parallel()
     reset_op_id()
     with pytest.raises(RuntimeError):
+        net.set_train()
         _executor.compile(net, x, phase='train')
diff --git a/tests/ut/python/parallel/test_auto_parallel_transformer.py b/tests/ut/python/parallel/test_auto_parallel_transformer.py
index 4a3d8daa44..196da302b2 100644
--- a/tests/ut/python/parallel/test_auto_parallel_transformer.py
+++ b/tests/ut/python/parallel/test_auto_parallel_transformer.py
@@ -112,4 +112,5 @@ def test_dmnet_train_step():
     net = GradWrap(NetWithLoss(MultiTransformer()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, input_)
diff --git a/tests/ut/python/parallel/test_auto_parallel_transpose.py b/tests/ut/python/parallel/test_auto_parallel_transpose.py
index c41ae9ce1c..d3d44e8f15 100644
--- a/tests/ut/python/parallel/test_auto_parallel_transpose.py
+++ b/tests/ut/python/parallel/test_auto_parallel_transpose.py
@@ -76,6 +76,7 @@ def test_two_matmul_transpose():
     net.set_auto_parallel()
     reset_op_id()
 
+    net.set_train()
     _executor.compile(net, x, y, b, phase='train')
     strategies = _executor._get_shard_strategy(net)
     expected_strategies = {'Default/network-Net/Transpose-op3': [[1, 16]],
diff --git a/tests/ut/python/parallel/test_auto_parallel_triangle_overwrite.py b/tests/ut/python/parallel/test_auto_parallel_triangle_overwrite.py
index 1436e1361e..5dd825b175 100644
--- a/tests/ut/python/parallel/test_auto_parallel_triangle_overwrite.py
+++ b/tests/ut/python/parallel/test_auto_parallel_triangle_overwrite.py
@@ -70,4 +70,5 @@ def test_triangle_strategy_consistency():
     net.set_auto_parallel()
     reset_op_id()
 
+    net.set_train()
     _executor.compile(net, x, phase='train')
diff --git a/tests/ut/python/parallel/test_auto_parallel_tuple_depend.py b/tests/ut/python/parallel/test_auto_parallel_tuple_depend.py
index 8ed66b958e..78b58a67d6 100644
--- a/tests/ut/python/parallel/test_auto_parallel_tuple_depend.py
+++ b/tests/ut/python/parallel/test_auto_parallel_tuple_depend.py
@@ -78,4 +78,5 @@ def test_virtual_dataset_3_input():
     x = Tensor(np.ones([128, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
     b = Tensor(np.ones([64, 2048]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y, b)
diff --git a/tests/ut/python/parallel/test_auto_parallel_two_matmul.py b/tests/ut/python/parallel/test_auto_parallel_two_matmul.py
index 9ddeb3d8c3..2f9c91625b 100644
--- a/tests/ut/python/parallel/test_auto_parallel_two_matmul.py
+++ b/tests/ut/python/parallel/test_auto_parallel_two_matmul.py
@@ -134,6 +134,7 @@ def test_two_matmul():
     net.set_auto_parallel()
     reset_op_id()
 
+    net.set_train()
     _executor.compile(net, x, y, b, phase='train')
     strategies = _executor._get_shard_strategy(net)
     expected_strategies = {'Default/network-Net/MatMul-op0': [[16, 1], [1, 1]],
diff --git a/tests/ut/python/parallel/test_auto_parallel_two_partial_matmul.py b/tests/ut/python/parallel/test_auto_parallel_two_partial_matmul.py
index aa0bfd126a..951f06e9f8 100644
--- a/tests/ut/python/parallel/test_auto_parallel_two_partial_matmul.py
+++ b/tests/ut/python/parallel/test_auto_parallel_two_partial_matmul.py
@@ -71,4 +71,5 @@ def test_four_matmul_linear():
     net = GradWrap(NetWithLoss(Net(strategy1)))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
diff --git a/tests/ut/python/parallel/test_auto_parallel_zig_zag.py b/tests/ut/python/parallel/test_auto_parallel_zig_zag.py
index 14affccf50..530e142b13 100644
--- a/tests/ut/python/parallel/test_auto_parallel_zig_zag.py
+++ b/tests/ut/python/parallel/test_auto_parallel_zig_zag.py
@@ -77,4 +77,5 @@ def test_zig_zag_graph():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, z, w, a)
diff --git a/tests/ut/python/parallel/test_auto_star_elimination.py b/tests/ut/python/parallel/test_auto_star_elimination.py
index 7b1945304e..8fd2ad2e19 100644
--- a/tests/ut/python/parallel/test_auto_star_elimination.py
+++ b/tests/ut/python/parallel/test_auto_star_elimination.py
@@ -89,4 +89,5 @@ def test_marin_loss():
     net = GradWrap(NetWithLoss(MarginCE()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
diff --git a/tests/ut/python/parallel/test_batch_matmul.py b/tests/ut/python/parallel/test_batch_matmul.py
index 87b5116348..c40d4d257a 100644
--- a/tests/ut/python/parallel/test_batch_matmul.py
+++ b/tests/ut/python/parallel/test_batch_matmul.py
@@ -45,6 +45,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_batch_parallel.py b/tests/ut/python/parallel/test_batch_parallel.py
index 962e0ca0f6..91f0f4e7b2 100644
--- a/tests/ut/python/parallel/test_batch_parallel.py
+++ b/tests/ut/python/parallel/test_batch_parallel.py
@@ -108,6 +108,7 @@ def test_batch():
     x = Tensor(np.ones([128, 16, 34, 34]), dtype=ms.float32)
     w1 = Tensor(np.ones([128, 8, 32, 32]), dtype=ms.float32)
     w2 = Tensor(np.ones([128, 64, 24, 24]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, w1, w2)
 
 
diff --git a/tests/ut/python/parallel/test_batch_parallel_dropout.py b/tests/ut/python/parallel/test_batch_parallel_dropout.py
index ba9c1a6933..3a4ed04759 100644
--- a/tests/ut/python/parallel/test_batch_parallel_dropout.py
+++ b/tests/ut/python/parallel/test_batch_parallel_dropout.py
@@ -70,4 +70,5 @@ def test_batch_parallel_dropout():
     x = Tensor(np.ones([128, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
     b = Tensor(np.ones([64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y, b)
diff --git a/tests/ut/python/parallel/test_batch_parallel_tensoradd.py b/tests/ut/python/parallel/test_batch_parallel_tensoradd.py
index a81079e8ea..a92b9ee2ba 100644
--- a/tests/ut/python/parallel/test_batch_parallel_tensoradd.py
+++ b/tests/ut/python/parallel/test_batch_parallel_tensoradd.py
@@ -68,4 +68,5 @@ def test_matmul_add():
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
     b = Tensor(np.ones([64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y, b)
diff --git a/tests/ut/python/parallel/test_batchnorm_ex_batch_parallel.py b/tests/ut/python/parallel/test_batchnorm_ex_batch_parallel.py
index 249fe60350..ee91d56097 100644
--- a/tests/ut/python/parallel/test_batchnorm_ex_batch_parallel.py
+++ b/tests/ut/python/parallel/test_batchnorm_ex_batch_parallel.py
@@ -73,4 +73,5 @@ def test_two_matmul_batchnorm_ex():
     x = Tensor(np.ones([128, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
     b = Tensor(np.ones([64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y, b)
diff --git a/tests/ut/python/parallel/test_broadcast_to.py b/tests/ut/python/parallel/test_broadcast_to.py
index 4159c9710e..450ecdb40d 100644
--- a/tests/ut/python/parallel/test_broadcast_to.py
+++ b/tests/ut/python/parallel/test_broadcast_to.py
@@ -68,6 +68,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x1)
     context.reset_auto_parallel_context()
 
@@ -77,6 +78,7 @@ def compile_net2(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x1, _x2)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_comparison_function_info.py b/tests/ut/python/parallel/test_comparison_function_info.py
index fc74d8ae46..62e9a19bc4 100644
--- a/tests/ut/python/parallel/test_comparison_function_info.py
+++ b/tests/ut/python/parallel/test_comparison_function_info.py
@@ -49,6 +49,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
diff --git a/tests/ut/python/parallel/test_concat.py b/tests/ut/python/parallel/test_concat.py
index cb7875751e..a7ce942c22 100644
--- a/tests/ut/python/parallel/test_concat.py
+++ b/tests/ut/python/parallel/test_concat.py
@@ -84,6 +84,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_dense_matmul.py b/tests/ut/python/parallel/test_dense_matmul.py
index e408c65f84..f98d32d381 100644
--- a/tests/ut/python/parallel/test_dense_matmul.py
+++ b/tests/ut/python/parallel/test_dense_matmul.py
@@ -51,4 +51,5 @@ def test_dmnet_train_step():
     label = Tensor(np.zeros([32, 768]).astype(np.float32))
     net = DenseMutMulNet()
     net = train_step_with_loss_warp(DenseMutMulNet())
+    net.set_train()
     _executor.compile(net, input_, label)
diff --git a/tests/ut/python/parallel/test_different_type_for_div_op.py b/tests/ut/python/parallel/test_different_type_for_div_op.py
index 0a07f08d80..0c894cea9a 100644
--- a/tests/ut/python/parallel/test_different_type_for_div_op.py
+++ b/tests/ut/python/parallel/test_different_type_for_div_op.py
@@ -37,6 +37,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
 
 
diff --git a/tests/ut/python/parallel/test_dropout_do_mask.py b/tests/ut/python/parallel/test_dropout_do_mask.py
index c966685b2a..f727105123 100644
--- a/tests/ut/python/parallel/test_dropout_do_mask.py
+++ b/tests/ut/python/parallel/test_dropout_do_mask.py
@@ -54,6 +54,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_element_wise_function.py b/tests/ut/python/parallel/test_element_wise_function.py
index 7d6924fb8e..775d391367 100644
--- a/tests/ut/python/parallel/test_element_wise_function.py
+++ b/tests/ut/python/parallel/test_element_wise_function.py
@@ -49,6 +49,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
diff --git a/tests/ut/python/parallel/test_embeddinglookup.py b/tests/ut/python/parallel/test_embeddinglookup.py
index 01159c0dc1..33c0645126 100644
--- a/tests/ut/python/parallel/test_embeddinglookup.py
+++ b/tests/ut/python/parallel/test_embeddinglookup.py
@@ -66,6 +66,7 @@ def test_embeddinglookup_reducescatter_false():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([8, 32, 8]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -77,6 +78,7 @@ def test_embeddinglookup_reducescatter_true():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([8, 32, 8]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -88,6 +90,7 @@ def test_embeddinglookup_reducescatter_false_grad():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([8, 32, 8]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -100,6 +103,7 @@ def test_embeddinglookup_reducescatter_true_grad():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([8, 32, 8]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -114,6 +118,7 @@ def test_embeddinglookup_semi_auto1():
     net.set_auto_parallel()
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -128,4 +133,5 @@ def test_embeddinglookup_semi_auto2():
     net.set_auto_parallel()
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
diff --git a/tests/ut/python/parallel/test_eval.py b/tests/ut/python/parallel/test_eval.py
new file mode 100644
index 0000000000..eb777c4d8c
--- /dev/null
+++ b/tests/ut/python/parallel/test_eval.py
@@ -0,0 +1,69 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import numpy as np
+
+import mindspore as ms
+from mindspore import context, Tensor, Parameter
+from mindspore.common.api import _executor
+from mindspore.nn import Cell
+from mindspore.ops import operations as P
+
+
+class Net(Cell):
+    """Multiplies the input by the parameter named "w1", then negates the product."""
+    def __init__(self, mul_weight, strategy1=None, strategy2=None):
+        super().__init__()
+        self.mul = P.Mul().shard(strategy1)
+        self.neg = P.Neg().shard(strategy2)
+        self.mul_weight = Parameter(mul_weight, "w1")
+
+    def construct(self, x, b):
+        product = self.mul(x, self.mul_weight)  # b is accepted but not used here
+        return self.neg(product)
+
+
+class EvalNet(Cell):
+    """Wraps a network and returns its output together with ReLU applied to that output."""
+    def __init__(self, network, strategy2=None):
+        super().__init__()
+        self.network = network
+        self.relu = P.ReLU().shard(strategy2)
+
+    def construct(self, x, b):
+        net_out = self.network(x, b)
+        return net_out, self.relu(net_out)
+
+
+_x = Tensor(np.ones([64, 64]), dtype=ms.float32)  # first network input given to compile()
+_w1 = Tensor(np.ones([64, 64]), dtype=ms.float32)  # initial value of Net's "w1" parameter
+_b = Tensor(np.ones([64, 64]), dtype=ms.float32)  # second network input given to compile()
+
+
+def test_train_and_eval():
+    """Compile Net with phase='train', then an EvalNet wrapping the same Net with phase='eval'."""
+    context.set_context(save_graphs=True, mode=context.GRAPH_MODE)
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=16)
+    try:  # the finally below resets the global auto-parallel context even if compile raises
+        strategy2 = ((4, 4),)
+        net = Net(_w1, ((4, 4), (4, 4)), strategy2)
+        eval_net = EvalNet(net, strategy2=strategy2)
+        net.set_auto_parallel()
+        net.set_train()
+        _executor.compile(net, _x, _b, phase='train', auto_parallel_mode=True)
+        eval_net.set_train(mode=False)
+        eval_net.set_auto_parallel()
+        _executor.compile(eval_net, _x, _b, phase='eval', auto_parallel_mode=True)
+    finally:
+        context.reset_auto_parallel_context()
diff --git a/tests/ut/python/parallel/test_expand_dims.py b/tests/ut/python/parallel/test_expand_dims.py
index d71a78346b..e93f974f84 100644
--- a/tests/ut/python/parallel/test_expand_dims.py
+++ b/tests/ut/python/parallel/test_expand_dims.py
@@ -58,6 +58,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_forward_graph.py b/tests/ut/python/parallel/test_forward_graph.py
index 8ad7451e6d..4780f0cd07 100644
--- a/tests/ut/python/parallel/test_forward_graph.py
+++ b/tests/ut/python/parallel/test_forward_graph.py
@@ -41,6 +41,7 @@ _b = Tensor(np.ones([128, 64, 32]), dtype=ms.float32)
 
 def compile_net(net):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_gather_v2.py b/tests/ut/python/parallel/test_gather_v2.py
index 9e845f5a58..b950d8b43a 100644
--- a/tests/ut/python/parallel/test_gather_v2.py
+++ b/tests/ut/python/parallel/test_gather_v2.py
@@ -71,6 +71,7 @@ def test_gatherv2_semi_auto0():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -83,6 +84,7 @@ def test_gatherv2_semi_auto1():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -95,6 +97,7 @@ def test_gatherv2_semi_auto2():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -107,6 +110,7 @@ def test_gatherv2_semi_auto3():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -119,6 +123,7 @@ def test_gatherv2_semi_auto4():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -131,6 +136,7 @@ def test_gatherv2_semi_auto5():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -142,6 +148,7 @@ def test_gatherv2_semi_auto6():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 32]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -153,6 +160,7 @@ def test_gatherv2_semi_auto7():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -165,6 +173,7 @@ def test_gatherv2_semi_auto8():
 
     x = Tensor(np.ones([64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -174,6 +183,7 @@ def test_gatherv2_auto0():
     net.set_auto_parallel()
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 32]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -183,4 +193,5 @@ def test_gatherv2_auto1():
     net.set_auto_parallel()
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
diff --git a/tests/ut/python/parallel/test_gpu_dropout.py b/tests/ut/python/parallel/test_gpu_dropout.py
index 148f0184f2..2dabc207c6 100644
--- a/tests/ut/python/parallel/test_gpu_dropout.py
+++ b/tests/ut/python/parallel/test_gpu_dropout.py
@@ -65,6 +65,7 @@ def test_dropout_semi_auto():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 128]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -77,6 +78,7 @@ def test_dropout_semi_auto2():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 128]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -89,6 +91,7 @@ def test_dropout_semi_auto3():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 128]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -99,4 +102,5 @@ def test_dropout_auto():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 128]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
diff --git a/tests/ut/python/parallel/test_hybird_parallel_activation.py b/tests/ut/python/parallel/test_hybird_parallel_activation.py
index 8ff335e059..87552aed46 100644
--- a/tests/ut/python/parallel/test_hybird_parallel_activation.py
+++ b/tests/ut/python/parallel/test_hybird_parallel_activation.py
@@ -49,6 +49,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
diff --git a/tests/ut/python/parallel/test_initializer_weight_slice.py b/tests/ut/python/parallel/test_initializer_weight_slice.py
index 85faf9fc21..cd68b9ebf4 100644
--- a/tests/ut/python/parallel/test_initializer_weight_slice.py
+++ b/tests/ut/python/parallel/test_initializer_weight_slice.py
@@ -53,6 +53,7 @@ def check_initializer_weight_slice(init_name="Uniform"):
         weight = initializer(init_name, [64, 32], ms.float32)
         net = Net(strategy1, strategy2, weight)
         net.set_auto_parallel()
+        net.set_train()
         exe.compile(net, x, auto_parallel_mode=True, phase='train')
         hccl.rank_id = rank_save
         return net.parameters_dict()['w1'].data.asnumpy()
@@ -131,6 +132,7 @@ def test_check_initializer_weight_slice_seed(init_name="Uniform"):
         weight = initializer(init_name, [64, 32], ms.float32)
         net = Net(strategy1, strategy2, weight)
         net.set_auto_parallel()
+        net.set_train()
         exe.compile(net, x, auto_parallel_mode=True, phase='train')
         hccl.rank_id = rank_save
         return net.parameters_dict()['w1'].data.asnumpy()
diff --git a/tests/ut/python/parallel/test_l2normalize.py b/tests/ut/python/parallel/test_l2normalize.py
index 850e71eb93..ff1d4f8924 100644
--- a/tests/ut/python/parallel/test_l2normalize.py
+++ b/tests/ut/python/parallel/test_l2normalize.py
@@ -75,4 +75,5 @@ def test_l2normalize_matmul():
     x = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
     y = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
     b = Tensor(np.ones([128, 32, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y, b)
diff --git a/tests/ut/python/parallel/test_layer_norm.py b/tests/ut/python/parallel/test_layer_norm.py
index 78a019a80c..50c30002a8 100644
--- a/tests/ut/python/parallel/test_layer_norm.py
+++ b/tests/ut/python/parallel/test_layer_norm.py
@@ -52,6 +52,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_linear.py b/tests/ut/python/parallel/test_linear.py
index 1b3cecad67..d368b5a033 100644
--- a/tests/ut/python/parallel/test_linear.py
+++ b/tests/ut/python/parallel/test_linear.py
@@ -73,4 +73,5 @@ def test_linear():
     y = Tensor(np.ones([64, 32]), dtype=ms.float32)
     bias = Tensor(np.ones([64]), dtype=ms.float32)
     label = Tensor(np.ones([64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y, bias, label)
diff --git a/tests/ut/python/parallel/test_loop_two_matmul.py b/tests/ut/python/parallel/test_loop_two_matmul.py
index 9c36bff0d1..5c162a6d8f 100644
--- a/tests/ut/python/parallel/test_loop_two_matmul.py
+++ b/tests/ut/python/parallel/test_loop_two_matmul.py
@@ -95,5 +95,6 @@ def test_two_matmul():
             net = GradWrap(NetWithLoss(Net(strategy1, strategy2)))
             context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
             net.set_auto_parallel()
+            net.set_train()
             _executor.compile(net, x, y, b)
             count = count + 1
diff --git a/tests/ut/python/parallel/test_loss_and_optimizer.py b/tests/ut/python/parallel/test_loss_and_optimizer.py
index 215c6dd8d2..03c641e59a 100644
--- a/tests/ut/python/parallel/test_loss_and_optimizer.py
+++ b/tests/ut/python/parallel/test_loss_and_optimizer.py
@@ -37,6 +37,7 @@ class NetWithLoss(nn.Cell):
 
 def compile_net(net, x, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, b)
 
 
diff --git a/tests/ut/python/parallel/test_manual_embedding_lookup.py b/tests/ut/python/parallel/test_manual_embedding_lookup.py
index 22741f8695..542348946f 100644
--- a/tests/ut/python/parallel/test_manual_embedding_lookup.py
+++ b/tests/ut/python/parallel/test_manual_embedding_lookup.py
@@ -67,6 +67,7 @@ def compile_net(net):
     optimizer.sparse_opt.add_prim_attr("primitive_target", "CPU")
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b, auto_parallel_mode=True)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_manual_gatherv2.py b/tests/ut/python/parallel/test_manual_gatherv2.py
index 1d7ffddc7d..dd563bc52a 100644
--- a/tests/ut/python/parallel/test_manual_gatherv2.py
+++ b/tests/ut/python/parallel/test_manual_gatherv2.py
@@ -64,6 +64,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b, auto_parallel_mode=True)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_matmul_dropout.py b/tests/ut/python/parallel/test_matmul_dropout.py
index 70718c7f5c..98f955935e 100644
--- a/tests/ut/python/parallel/test_matmul_dropout.py
+++ b/tests/ut/python/parallel/test_matmul_dropout.py
@@ -75,4 +75,5 @@ def test_two_matmul_dropout():
     x = Tensor(np.ones([128, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
     b = Tensor(np.ones([64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y, b)
diff --git a/tests/ut/python/parallel/test_matmul_tensor.py b/tests/ut/python/parallel/test_matmul_tensor.py
index 64359d7caa..12924a7275 100644
--- a/tests/ut/python/parallel/test_matmul_tensor.py
+++ b/tests/ut/python/parallel/test_matmul_tensor.py
@@ -51,6 +51,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
 
 
diff --git a/tests/ut/python/parallel/test_mix_precision_hybrid_parallel.py b/tests/ut/python/parallel/test_mix_precision_hybrid_parallel.py
index 867246e97a..d30d27bf19 100644
--- a/tests/ut/python/parallel/test_mix_precision_hybrid_parallel.py
+++ b/tests/ut/python/parallel/test_mix_precision_hybrid_parallel.py
@@ -87,4 +87,5 @@ def test_two_matmul():
     b = Tensor(np.ones([32, 64]), dtype=ms.float32)
     z = Tensor(np.ones([64, 64]), dtype=ms.float32)
 
+    net.set_train()
     _executor.compile(net, x, y, b, z)
diff --git a/tests/ut/python/parallel/test_neg.py b/tests/ut/python/parallel/test_neg.py
index 28dac24ab5..6afcfe251f 100644
--- a/tests/ut/python/parallel/test_neg.py
+++ b/tests/ut/python/parallel/test_neg.py
@@ -43,6 +43,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_one_hot_net.py b/tests/ut/python/parallel/test_one_hot_net.py
index 9f8eebf915..8ba68e1fe5 100644
--- a/tests/ut/python/parallel/test_one_hot_net.py
+++ b/tests/ut/python/parallel/test_one_hot_net.py
@@ -278,6 +278,7 @@ def test_bn_reshape_dense_bn_train_loss():
     net = GradWrap(NetWithLoss(BNReshapeDenseBNNet()))
     net.set_auto_parallel()
 
+    net.set_train()
     _executor.compile(net, input_, label)
 
 
@@ -292,6 +293,7 @@ def test_semi_one_hot_net_batch():
     net = GradWrap(NetWithLoss(net))
     net.set_auto_parallel()
 
+    net.set_train()
     _executor.compile(net, input_, label)
 
 
diff --git a/tests/ut/python/parallel/test_one_weight_parameter.py b/tests/ut/python/parallel/test_one_weight_parameter.py
index 8bd83148b4..6cffd4e7dd 100644
--- a/tests/ut/python/parallel/test_one_weight_parameter.py
+++ b/tests/ut/python/parallel/test_one_weight_parameter.py
@@ -76,4 +76,5 @@ def test_one_weight_parameter():
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     train_net.set_auto_parallel()
 
+    train_net.set_train()
     _executor.compile(train_net, x, b)
diff --git a/tests/ut/python/parallel/test_onehot.py b/tests/ut/python/parallel/test_onehot.py
index 26a77f40ea..d39fe28ff6 100644
--- a/tests/ut/python/parallel/test_onehot.py
+++ b/tests/ut/python/parallel/test_onehot.py
@@ -78,6 +78,7 @@ def compile_graph(strategy1, strategy2, strategy3, strategy4, auto=False, onthot
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
     b = Tensor(np.ones([64]), dtype=ms.int32)
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
diff --git a/tests/ut/python/parallel/test_pack.py b/tests/ut/python/parallel/test_pack.py
index ccc8356703..51ee607a65 100644
--- a/tests/ut/python/parallel/test_pack.py
+++ b/tests/ut/python/parallel/test_pack.py
@@ -84,6 +84,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x)
     context.reset_auto_parallel_context()
 
@@ -93,6 +94,7 @@ def compile_net1(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x1)
     context.reset_auto_parallel_context()
 
@@ -102,6 +104,7 @@ def compile_net2(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x2)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_parallel_optimizer.py b/tests/ut/python/parallel/test_parallel_optimizer.py
index e0f214878c..21f39346c3 100644
--- a/tests/ut/python/parallel/test_parallel_optimizer.py
+++ b/tests/ut/python/parallel/test_parallel_optimizer.py
@@ -76,6 +76,7 @@ def auto_parallel_compile_net(mode, dev_num, strategy1=None, strategy2=None):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_network = TrainOneStepCell(net, optimizer)
     train_network.set_auto_parallel()
+    train_network.set_train()
     _executor.compile(train_network, inputs, label)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_parameter_multi_users.py b/tests/ut/python/parallel/test_parameter_multi_users.py
index 051af762c3..d0f01195c9 100644
--- a/tests/ut/python/parallel/test_parameter_multi_users.py
+++ b/tests/ut/python/parallel/test_parameter_multi_users.py
@@ -56,6 +56,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_pipeline_parallel.py b/tests/ut/python/parallel/test_pipeline_parallel.py
index d20c77c98a..3f8147d2e9 100644
--- a/tests/ut/python/parallel/test_pipeline_parallel.py
+++ b/tests/ut/python/parallel/test_pipeline_parallel.py
@@ -74,6 +74,7 @@ def test_gatherv2_semi_samestage1():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 def test_gatherv2_semi_samestage2():
@@ -86,4 +87,5 @@ def test_gatherv2_semi_samestage2():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
diff --git a/tests/ut/python/parallel/test_prelu.py b/tests/ut/python/parallel/test_prelu.py
index 7ac0c3cf7b..4732823781 100644
--- a/tests/ut/python/parallel/test_prelu.py
+++ b/tests/ut/python/parallel/test_prelu.py
@@ -49,6 +49,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -166,6 +167,7 @@ def test_prelu_parallel_success3():
     w = Tensor(np.random.rand(16), dtype=ms.float32)
     net = GradWrap3(NetWithLoss3(Net(strategy1, strategy2)))
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, w)
 
 
diff --git a/tests/ut/python/parallel/test_reduce_method_info.py b/tests/ut/python/parallel/test_reduce_method_info.py
index 7d2c100f65..6e369ff134 100644
--- a/tests/ut/python/parallel/test_reduce_method_info.py
+++ b/tests/ut/python/parallel/test_reduce_method_info.py
@@ -69,11 +69,13 @@ class GradWrap(nn.Cell):
 
 def compile_net_no_bias(net, x, y):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
 
 
 def compile_net(net, x, y, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
diff --git a/tests/ut/python/parallel/test_repeated_calc.py b/tests/ut/python/parallel/test_repeated_calc.py
index 5ce133a759..8e1d7f3c48 100644
--- a/tests/ut/python/parallel/test_repeated_calc.py
+++ b/tests/ut/python/parallel/test_repeated_calc.py
@@ -49,6 +49,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
diff --git a/tests/ut/python/parallel/test_reshape.py b/tests/ut/python/parallel/test_reshape.py
index cb2aac7109..00b25c1afc 100644
--- a/tests/ut/python/parallel/test_reshape.py
+++ b/tests/ut/python/parallel/test_reshape.py
@@ -317,6 +317,7 @@ class ReshapeNet6(nn.Cell):
 
 def compile_net(net, input_):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, input_)
 
 
diff --git a/tests/ut/python/parallel/test_reshape_optimized.py b/tests/ut/python/parallel/test_reshape_optimized.py
index 74b4c0024d..e3b3003bfd 100644
--- a/tests/ut/python/parallel/test_reshape_optimized.py
+++ b/tests/ut/python/parallel/test_reshape_optimized.py
@@ -44,6 +44,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_reshape_parameter.py b/tests/ut/python/parallel/test_reshape_parameter.py
index 3b23c4d13a..9d6740cad2 100644
--- a/tests/ut/python/parallel/test_reshape_parameter.py
+++ b/tests/ut/python/parallel/test_reshape_parameter.py
@@ -63,6 +63,7 @@ class Net(nn.Cell):
 
 def compile_net(net, x, y):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
 
 
diff --git a/tests/ut/python/parallel/test_reshape_skip_redistribution.py b/tests/ut/python/parallel/test_reshape_skip_redistribution.py
index 29c0144301..cb9d0a121c 100644
--- a/tests/ut/python/parallel/test_reshape_skip_redistribution.py
+++ b/tests/ut/python/parallel/test_reshape_skip_redistribution.py
@@ -47,6 +47,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_reshape_unexpand.py b/tests/ut/python/parallel/test_reshape_unexpand.py
index d0144bebdf..137fe66a58 100644
--- a/tests/ut/python/parallel/test_reshape_unexpand.py
+++ b/tests/ut/python/parallel/test_reshape_unexpand.py
@@ -67,6 +67,7 @@ def test_reshape_unexpand():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 def test_reshape_unexpand_1():
@@ -89,6 +90,7 @@ def test_reshape_unexpand_1():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 def test_reshape_unexpand_2():
@@ -111,6 +113,7 @@ def test_reshape_unexpand_2():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 def test_reshape_unexpand_3():
@@ -134,6 +137,7 @@ def test_reshape_unexpand_3():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 def test_reshape_unexpand_4():
@@ -157,6 +161,7 @@ def test_reshape_unexpand_4():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 def test_reshape_unexpand_5():
@@ -180,6 +185,7 @@ def test_reshape_unexpand_5():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 def test_reshape_unexpand_6():
@@ -203,6 +209,7 @@ def test_reshape_unexpand_6():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 def test_reshape_unexpand_7():
@@ -235,6 +242,7 @@ def test_reshape_unexpand_7():
     x = Tensor(np.ones([32, 3, 224, 224]), dtype=ms.float32)
     net = GradWrap(NetWithLoss(Net()))
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 def test_reshape_unexpand_8():
@@ -257,4 +265,5 @@ def test_reshape_unexpand_8():
     net = GradWrap(NetWithLoss(Net()))
     context.set_auto_parallel_context(parallel_mode="auto_parallel")
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
diff --git a/tests/ut/python/parallel/test_scalar_loss.py b/tests/ut/python/parallel/test_scalar_loss.py
index 1043b2997c..f25e7737ba 100644
--- a/tests/ut/python/parallel/test_scalar_loss.py
+++ b/tests/ut/python/parallel/test_scalar_loss.py
@@ -60,4 +60,5 @@ def test_sum_as_loss():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 32]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
diff --git a/tests/ut/python/parallel/test_self_attention.py b/tests/ut/python/parallel/test_self_attention.py
index a484e1fd63..abba5b84cf 100644
--- a/tests/ut/python/parallel/test_self_attention.py
+++ b/tests/ut/python/parallel/test_self_attention.py
@@ -52,6 +52,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x)
 
 
diff --git a/tests/ut/python/parallel/test_semi_auto_two_subgraphs.py b/tests/ut/python/parallel/test_semi_auto_two_subgraphs.py
index 85fc37c497..97aa17b26a 100644
--- a/tests/ut/python/parallel/test_semi_auto_two_subgraphs.py
+++ b/tests/ut/python/parallel/test_semi_auto_two_subgraphs.py
@@ -107,4 +107,5 @@ def test_two_subgraphs():
     net = TrainStepWrap(NetWithLoss(Net()))
     input_x = Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32)
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, input_x)
diff --git a/tests/ut/python/parallel/test_sigmoid_cross_entropy_with_logits.py b/tests/ut/python/parallel/test_sigmoid_cross_entropy_with_logits.py
index 0311f82462..d8e00c32c3 100644
--- a/tests/ut/python/parallel/test_sigmoid_cross_entropy_with_logits.py
+++ b/tests/ut/python/parallel/test_sigmoid_cross_entropy_with_logits.py
@@ -43,6 +43,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_softmax_cross_entropy_loss.py b/tests/ut/python/parallel/test_softmax_cross_entropy_loss.py
index 69d1f7c47c..c48e0dbbf5 100644
--- a/tests/ut/python/parallel/test_softmax_cross_entropy_loss.py
+++ b/tests/ut/python/parallel/test_softmax_cross_entropy_loss.py
@@ -48,6 +48,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
diff --git a/tests/ut/python/parallel/test_sparse_feature_bprop.py b/tests/ut/python/parallel/test_sparse_feature_bprop.py
index 78dcd6dacb..1ba968d62b 100644
--- a/tests/ut/python/parallel/test_sparse_feature_bprop.py
+++ b/tests/ut/python/parallel/test_sparse_feature_bprop.py
@@ -60,6 +60,7 @@ def test_bprop_with_sparse_feature_allreduce():
     net = GradWrap(Net())
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
 
+    net.set_train()
     _executor.compile(net, x)
 
 
@@ -87,6 +88,7 @@ def test_bprop_with_sparse_feature_mirror():
     def compile_net(net):
         optimizer = Adam(net.trainable_params(), learning_rate=0.1, loss_scale=1024.0, weight_decay=0.9)
         train_net = TrainOneStepCell(net, optimizer)
+        train_net.set_train()
         _executor.compile(train_net, _x, _b)
 
     net = Net()
@@ -119,6 +121,7 @@ def test_bprop_with_sparse_feature_dataparallel():
     def compile_net(net):
         optimizer = Adam(net.trainable_params(), learning_rate=0.1, loss_scale=1024.0, weight_decay=0.9)
         train_net = TrainOneStepCell(net, optimizer)
+        train_net.set_train()
         _executor.compile(train_net, _x, _b)
 
     net = Net()
diff --git a/tests/ut/python/parallel/test_sparse_gather_v2.py b/tests/ut/python/parallel/test_sparse_gather_v2.py
index 2250e49320..e80f3c0832 100644
--- a/tests/ut/python/parallel/test_sparse_gather_v2.py
+++ b/tests/ut/python/parallel/test_sparse_gather_v2.py
@@ -72,6 +72,7 @@ def test_gatherv2_semi_auto0():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -84,6 +85,7 @@ def test_gatherv2_semi_auto1():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -96,6 +98,7 @@ def test_gatherv2_semi_auto2():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -108,6 +111,7 @@ def test_gatherv2_semi_auto3():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -120,6 +124,7 @@ def test_gatherv2_semi_auto4():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -132,6 +137,7 @@ def test_gatherv2_semi_auto5():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -143,6 +149,7 @@ def test_gatherv2_semi_auto6():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 32]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -154,6 +161,7 @@ def test_gatherv2_semi_auto7():
 
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -163,6 +171,7 @@ def test_gatherv2_auto0():
     net.set_auto_parallel()
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 32]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -172,6 +181,7 @@ def test_gatherv2_auto1():
     net.set_auto_parallel()
     x = Tensor(np.ones([64, 32]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -184,6 +194,7 @@ def test_gatherv2_cpu0():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -196,6 +207,7 @@ def test_gatherv2_cpu1():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
 
 
@@ -208,4 +220,5 @@ def test_gatherv2_cpu2():
 
     x = Tensor(np.ones([64, 64]), dtype=ms.float32)
     y = Tensor(np.ones([64, 64, 64]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x, y)
diff --git a/tests/ut/python/parallel/test_split.py b/tests/ut/python/parallel/test_split.py
index 0ebf22fdff..a7210ab77c 100644
--- a/tests/ut/python/parallel/test_split.py
+++ b/tests/ut/python/parallel/test_split.py
@@ -79,6 +79,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x)
     context.reset_auto_parallel_context()
 
@@ -88,6 +89,7 @@ def compile_net1(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x1)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_split_grad_sens.py b/tests/ut/python/parallel/test_split_grad_sens.py
index 2d57604a94..077dd9540e 100644
--- a/tests/ut/python/parallel/test_split_grad_sens.py
+++ b/tests/ut/python/parallel/test_split_grad_sens.py
@@ -66,10 +66,12 @@ class GradWrap4(nn.Cell):
 
 def compile_net(net, x, y, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 def compile_net_no_bias(net, x, y):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
 
 def test_no_grad():
@@ -120,6 +122,7 @@ def test_grad_sens_parameter_type():
 
     sens = Tensor(np.ones([128, 64]), dtype=ms.float32)
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b, sens, phase='train', auto_parallel_mode=True)
     x_layout = ([8, 8], [1, -1], [16, 32], 0, True, '')
     y_layout = ([8, 8], [-1, 0], [32, 8], 0, True, '')
diff --git a/tests/ut/python/parallel/test_square.py b/tests/ut/python/parallel/test_square.py
index 823a21ad1f..a354395c7d 100644
--- a/tests/ut/python/parallel/test_square.py
+++ b/tests/ut/python/parallel/test_square.py
@@ -45,6 +45,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_squeeze_info.py b/tests/ut/python/parallel/test_squeeze_info.py
index 1edee94552..76bedeb000 100644
--- a/tests/ut/python/parallel/test_squeeze_info.py
+++ b/tests/ut/python/parallel/test_squeeze_info.py
@@ -39,6 +39,7 @@ _b = Tensor(np.ones([64, 32]), dtype=ms.float32)
 
 def compile_net(net):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_step_parallel.py b/tests/ut/python/parallel/test_step_parallel.py
index db3bab1ab0..5eb9ba7157 100644
--- a/tests/ut/python/parallel/test_step_parallel.py
+++ b/tests/ut/python/parallel/test_step_parallel.py
@@ -76,4 +76,5 @@ def test_two_matmul():
     b = Tensor(np.ones([128, 128]), dtype=ms.float32)
     a = Tensor(np.ones([128, 128]), dtype=ms.float32)
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b, a)
diff --git a/tests/ut/python/parallel/test_strategy_checkpoint.py b/tests/ut/python/parallel/test_strategy_checkpoint.py
index 31e0c20034..9a2db97951 100644
--- a/tests/ut/python/parallel/test_strategy_checkpoint.py
+++ b/tests/ut/python/parallel/test_strategy_checkpoint.py
@@ -87,6 +87,7 @@ def test_six_matmul_save():
     net.set_auto_parallel()
     x1 = Tensor(np.ones([32, 32]), dtype=ms.float32)
     x6 = Tensor(np.ones([128, 32]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x1, x6)
 
 
@@ -149,6 +150,7 @@ def test_six_matmul_load():
     x1 = Tensor(np.ones([32, 32]), dtype=ms.float32)
     x6 = Tensor(np.ones([128, 32]), dtype=ms.float32)
     x7 = Tensor(np.ones([32, 32]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x1, x6, x7)
 
 
@@ -205,6 +207,7 @@ def test_six_matmul_save_auto():
     net.set_auto_parallel()
     x1 = Tensor(np.ones([32, 32]), dtype=ms.float32)
     x6 = Tensor(np.ones([128, 32]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x1, x6)
 
 
@@ -265,4 +268,5 @@ def test_six_matmul_load_auto():
     x1 = Tensor(np.ones([32, 32]), dtype=ms.float32)
     x6 = Tensor(np.ones([128, 32]), dtype=ms.float32)
     x7 = Tensor(np.ones([32, 32]), dtype=ms.float32)
+    net.set_train()
     _executor.compile(net, x1, x6, x7)
diff --git a/tests/ut/python/parallel/test_stridedslice.py b/tests/ut/python/parallel/test_stridedslice.py
index 828b7f80ed..acf2344699 100644
--- a/tests/ut/python/parallel/test_stridedslice.py
+++ b/tests/ut/python/parallel/test_stridedslice.py
@@ -71,6 +71,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_sum_as_loss.py b/tests/ut/python/parallel/test_sum_as_loss.py
index 60162cb6e6..35a5197385 100644
--- a/tests/ut/python/parallel/test_sum_as_loss.py
+++ b/tests/ut/python/parallel/test_sum_as_loss.py
@@ -37,6 +37,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y)
 
 
diff --git a/tests/ut/python/parallel/test_tile.py b/tests/ut/python/parallel/test_tile.py
index 14cfdfb59c..7cae9f68b5 100644
--- a/tests/ut/python/parallel/test_tile.py
+++ b/tests/ut/python/parallel/test_tile.py
@@ -64,6 +64,7 @@ def compile_net(net):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, _x, _b)
     context.reset_auto_parallel_context()
 
diff --git a/tests/ut/python/parallel/test_train_and_eval.py b/tests/ut/python/parallel/test_train_and_eval.py
index a851e9c318..1cbccbf959 100644
--- a/tests/ut/python/parallel/test_train_and_eval.py
+++ b/tests/ut/python/parallel/test_train_and_eval.py
@@ -34,7 +34,7 @@ class Net(Cell):
         return out
 
 
-class EvalNet(Cell):
+class EvalNet(Cell):
     def __init__(self, network, strategy2=None):
         super().__init__()
         self.network = network
@@ -46,9 +46,9 @@ class EvalNet(Cell):
         return out
 
 
-_x = Tensor(np.ones([8, 8]), dtype=ms.float32)
-_w1 = Tensor(np.ones([8, 8]), dtype=ms.float32)
-_b = Tensor(np.ones([8, 8]), dtype=ms.float32)
+_x = Tensor(np.ones([64, 64]), dtype=ms.float32)
+_w1 = Tensor(np.ones([64, 64]), dtype=ms.float32)
+_b = Tensor(np.ones([64, 64]), dtype=ms.float32)
 
 
 def test_train_and_eval():
@@ -58,8 +58,8 @@ def test_train_and_eval():
     strategy2 = ((4, 4),)
     net = Net(_w1, strategy1, strategy2)
     eval_net = EvalNet(net, strategy2=strategy2)
-    net.set_train()
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, _x, _b, phase='train', auto_parallel_mode=True)
 
     eval_net.set_train(mode=False)
diff --git a/tests/ut/python/parallel/test_two_matmul.py b/tests/ut/python/parallel/test_two_matmul.py
index dbef5e7e71..13460f5828 100644
--- a/tests/ut/python/parallel/test_two_matmul.py
+++ b/tests/ut/python/parallel/test_two_matmul.py
@@ -49,6 +49,7 @@ class GradWrap(nn.Cell):
 
 def compile_net(net, x, y, b):
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
diff --git a/tests/ut/python/parallel/test_two_weights_parameter.py b/tests/ut/python/parallel/test_two_weights_parameter.py
index 50d4fb17b0..cebee4ce80 100644
--- a/tests/ut/python/parallel/test_two_weights_parameter.py
+++ b/tests/ut/python/parallel/test_two_weights_parameter.py
@@ -80,4 +80,5 @@ def test_two_weights_parameter():
     train_net = OneStepCell(net_with_loss)
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
     train_net.set_auto_parallel()
+    train_net.set_train()
     _executor.compile(train_net, x, b)
diff --git a/tests/ut/python/parallel/test_virtual_dataset_3_input.py b/tests/ut/python/parallel/test_virtual_dataset_3_input.py
index e9015713ca..e1be9e7868 100644
--- a/tests/ut/python/parallel/test_virtual_dataset_3_input.py
+++ b/tests/ut/python/parallel/test_virtual_dataset_3_input.py
@@ -76,6 +76,7 @@ def test_virtual_dataset_3_input():
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
     b = Tensor(np.ones([64, 2048]), dtype=ms.float32)
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)
 
 
@@ -100,6 +101,7 @@ def test_virtualdataset_cell_3_inputs():
     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
     b = Tensor(np.ones([64, 2048]), dtype=ms.float32)
     net.set_auto_parallel()
+    net.set_train()
     _executor.compile(net, x, y, b)