!6560 implement parallel pack

Merge pull request !6560 from yihuaijie/master
5 years ago · 6f1c603284
parent edd8214459 6066b16838
commit 6f1c603284
12 changed files with 559 additions and 28 deletions
--- a/mindspore/ccsrc/frontend/parallel/auto_parallel/operator_costmodel.h
+++ b/mindspore/ccsrc/frontend/parallel/auto_parallel/operator_costmodel.h
@ -199,6 +199,8 @@ class SoftmaxCost : public OperatorCost {
 using SoftmaxCostPtr = std::shared_ptr<SoftmaxCost>;
 using TileCost = SoftmaxCost;
 using TileCostPtr = std::shared_ptr<TileCost>;
+using PackCost = TileCost;
+using PackCostPtr = std::shared_ptr<PackCost>;
 using ConcatCost = TileCost;
 using ConcatCostPtr = std::shared_ptr<ConcatCost>;
 using SplitCost = TileCost;
--- a/mindspore/ccsrc/frontend/parallel/dynamic_creator.h
+++ b/mindspore/ccsrc/frontend/parallel/dynamic_creator.h
@ -178,6 +178,7 @@ REGISTER(EmbeddingLookupInfo);
 REGISTER(TileInfo);
 REGISTER(StridedSliceInfo);
 REGISTER(DropoutInfo);
+REGISTER(PackInfo);
 REGISTER(ConcatInfo);
 REGISTER(SplitInfo);
 }  // namespace parallel
--- a/mindspore/ccsrc/frontend/parallel/node_check.cc
+++ b/mindspore/ccsrc/frontend/parallel/node_check.cc
@ -39,7 +39,6 @@ const std::set<std::string> BLACK_LIST = {TUPLE_GETITEM,
                                          TILE_SHAPE,
                                          TUPLE_DIV,
                                          TUPLE_TO_ARRAY,
-                                          MAKE_LIST,
                                          MAKE_DICT,
                                          MAKE_SLICE,
                                          MAKE_RECORD,
--- a/mindspore/ccsrc/frontend/parallel/ops_info/ops_info_head_files.h
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/ops_info_head_files.h
@ -41,5 +41,6 @@
 #include "frontend/parallel/ops_info/strided_slice_info.h"
 #include "frontend/parallel/ops_info/concat_info.h"
 #include "frontend/parallel/ops_info/split_info.h"
+#include "frontend/parallel/ops_info/pack_info.h"

 #endif  // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_HEAD_FILES_H_
--- a/mindspore/ccsrc/frontend/parallel/ops_info/pack_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/pack_info.cc
--- a/mindspore/ccsrc/frontend/parallel/ops_info/pack_info.h
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/pack_info.h
@ -0,0 +1,62 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_PACK_INFO_H_
+#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_PACK_INFO_H_
+
+#include <string>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "ir/value.h"
+#include "frontend/parallel/auto_parallel/operator_costmodel.h"
+#include "frontend/parallel/ops_info/operator_info.h"
+#include "frontend/parallel/strategy.h"
+
+namespace mindspore {
+namespace parallel {
+// Parallel operator-info class for the Pack primitive.
+// Only the declarations are visible in this header; the overrides that are not defined inline
+// are presumably implemented in pack_info.cc (not shown here) -- confirm there for exact behavior.
+class PackInfo : public OperatorInfo {
+ public:
+  // Forwards name, shapes and attrs to OperatorInfo and installs a PackCost constructed with `false`
+  // as the operator's cost model (PackCost is an alias of TileCost in operator_costmodel.h).
+  PackInfo(const std::string &operator_name, const Shapes &inputs_shape, const Shapes &outputs_shape,
+           const PrimitiveAttrs &attrs)
+      : OperatorInfo(operator_name, inputs_shape, outputs_shape, attrs, std::make_shared<PackCost>(false)) {}
+  ~PackInfo() override = default;
+
+  // Overrides of the OperatorInfo entry points; definitions are out of line.
+  Status Init(const StrategyPtr &strategy) override;
+  Status InitForCostModel(const StrategyPtr &strategy) override;
+  Status GenerateStrategies(int32_t) override;
+  Status SetCostUnderStrategy(const StrategyPtr &) override;
+  void ReComputeBatchSplitFlagList() override;
+
+ protected:
+  Status GetAttrs() override;
+  Status CheckStrategy(const StrategyPtr &strategy) override;
+  Status InferMirrorOps() override;
+  // Unconditionally reports SUCCESS; this class performs no work in this hook.
+  Status InferForwardCommunication() override { return SUCCESS; }
+  Status InferTensorInfo() override;
+  Status InferDevMatrixShape() override;
+  Status InferTensorMap() override;
+
+ private:
+  // Defaults to 0. NOTE(review): presumably the Pack `axis` attribute filled in by GetAttrs() -- confirm.
+  size_t axis_ = 0;
+};
+
+using PackInfoPtr = std::shared_ptr<PackInfo>;
+}  // namespace parallel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_PACK_INFO_H_
--- a/mindspore/ccsrc/frontend/parallel/step_auto_parallel.cc
+++ b/mindspore/ccsrc/frontend/parallel/step_auto_parallel.cc
@ -116,7 +116,8 @@ bool StepAutoParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &) {
 std::vector<bool> ExtractInputParameterByNode(const CNodePtr &node) {
  std::vector<bool> is_parameter;
  std::vector<AnfNodePtr> node_inputs{node->inputs()};
-  if ((node_inputs.size() == 2) && AnfNodeIsPrimitive(node_inputs[1], MAKE_TUPLE)) {
+  if ((node_inputs.size() == 2) &&
+      (AnfNodeIsPrimitive(node_inputs[1], MAKE_TUPLE) || AnfNodeIsPrimitive(node_inputs[1], MAKE_LIST))) {
    node_inputs = node_inputs[1]->cast<CNodePtr>()->inputs();
  }
  for (size_t i = 1; i < node_inputs.size(); ++i) {
@ -193,7 +194,8 @@ std::vector<size_t> ExtractInputTypeLengthByNode(const CNodePtr &node) {
  std::vector<size_t> inputs_type_len;
  std::vector<AnfNodePtr> node_inputs{node->inputs()};

-  if ((node_inputs.size() == 2) && AnfNodeIsPrimitive(node_inputs[1], MAKE_TUPLE)) {
+  if ((node_inputs.size() == 2) &&
+      (AnfNodeIsPrimitive(node_inputs[1], MAKE_TUPLE) || AnfNodeIsPrimitive(node_inputs[1], MAKE_LIST))) {
    node_inputs = node_inputs[1]->cast<CNodePtr>()->inputs();
  }

@ -259,7 +261,7 @@ bool IsSplittableOperator(const std::string &op_name) {
    {MATMUL, TRANSPOSE, GELU, TANH, SOFTMAX, SUB, MUL, DIV, RESHAPE, GREATER, LOG_SOFTMAX, ACTIVATION, PRELU,
     FLOORDIV, L2_NORMALIZE, TENSOR_ADD, MAXPOOL, MAXPOOLV2, VIRTUAL_DATA_SET, RELU, ONEHOT, DROPOUT_DO_MASK,
     REDUCE_MAX, REDUCE_MIN, ARGMAXWITHVALUE, ARGMINWITHVALUE, REDUCE_SUM, CONV2D, FUSE_BATCH_NORM, POOLING,
-     MAX_POOL_WITH_ARGMAX, SIMPLE_MEAN, FLATTEN, BATCH_NORM, LAYER_NORM, BIAS_ADD, ASSIGN_SUB, COS, ACOS, EXP,
+     MAX_POOL_WITH_ARGMAX, SIMPLE_MEAN, FLATTEN, BATCH_NORM, LAYER_NORM, BIAS_ADD, ASSIGN_SUB, COS, ACOS, EXP, PACK,
     LOG, REDUCE_MEAN, REAL_DIV, SIGMOID, POW, MAXIMUM, MINIMUM, EQUAL, NOT_EQUAL, LOGICALNOT, GATHERV2, SQRT, CONCAT,
     STRIDEDSLICE, GET_NEXT, CAST, NEG, SQUARE, BATCH_MATMUL, EXPAND_DIMS, SQUEEZE, SPARSE_GATHERV2, TILE, DROPOUT,
     SOFTMAX_CROSS_ENTROPY_WITH_LOGITS, SIGMOID_CROSS_ENTROPY_WITH_LOGITS, SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS,
@ -281,7 +283,7 @@ bool IsAutoParallelCareNode(const CNodePtr &cnode) {
    return false;
  }
  bool bool_result = IsParallelCareNode(cnode) && !IsSplittableOperator(prim->name());
-  if (bool_result && (prim->name() != MAKE_TUPLE)) {
+  if (bool_result && (prim->name() != MAKE_TUPLE) && (prim->name() != MAKE_LIST)) {
    MS_LOG(EXCEPTION) << "Should implementing OperatorInfo for: " << prim->name();
  } else if (prim->name() == CAST) {
    if (cnode->fullname_with_scope().find(OPTIMIZER_SUB_STRING) != std::string::npos) {
--- a/mindspore/ccsrc/frontend/parallel/step_parallel.cc
+++ b/mindspore/ccsrc/frontend/parallel/step_parallel.cc
@ -450,7 +450,7 @@ void StepRedistribution(const CNodePtr &node, const OperatorInfoPtr &distribute_
  AnfNodeIndexSet node_set = manager->node_users()[node];
  CNodePtr insert_node_new;

-  if (AnfNodeIsPrimitive(node, MAKE_TUPLE)) {
+  if (AnfNodeIsPrimitive(node, MAKE_TUPLE) || AnfNodeIsPrimitive(node, MAKE_LIST)) {
    MS_LOG(INFO) << "No need to insert redistribution op betweend make_tuple node and the next node";
    return;
  }
@ -851,7 +851,8 @@ void InsertMirrorOps(const MirrorOps &mirror_ops, const CNodePtr &node) {
  FuncGraphManagerPtr manager = func_graph->manager();
  MS_EXCEPTION_IF_NULL(manager);

-  if ((node->inputs().size() == 2) && AnfNodeIsPrimitive(node->input(1), MAKE_TUPLE)) {
+  if ((node->inputs().size() == 2) &&
+      (AnfNodeIsPrimitive(node->input(1), MAKE_TUPLE) || AnfNodeIsPrimitive(node->input(1), MAKE_LIST))) {
    MS_LOG(INFO) << "The mirror for " << GetPrimName(node) << " has handle by make_tuple node";
    return;
  }
@ -1055,7 +1056,7 @@ Shapes GetNodeShape(const AnfNodePtr &node) {
    MS_LOG(EXCEPTION) << "GetNodeShape: " << node->ToString() << " shape_ptr is nullptr, full name is "
                      << node->fullname_with_scope();
  }
-  auto tuple_shape_ptr = dyn_cast<abstract::TupleShape>(base_shape_ptr);
+  auto tuple_shape_ptr = dyn_cast<abstract::SequeueShape>(base_shape_ptr);
  if (tuple_shape_ptr != nullptr) {
    auto tuple_shape = tuple_shape_ptr->shape();
    for (auto &shape : tuple_shape) {
@ -1436,7 +1437,7 @@ void ExtractInformation(const std::vector<AnfNodePtr> &all_nodes) {
    SetVirtualDatasetStrategy(cnode);
    ValueNodePtr prim_anf_node = cnode->input(0)->cast<ValueNodePtr>();
    PrimitivePtr prim = GetValueNode<PrimitivePtr>(prim_anf_node);
-    if (prim->name() == MAKE_TUPLE) {
+    if (prim->name() == MAKE_TUPLE || prim->name() == MAKE_LIST) {
      continue;
    }
    auto attrs = prim->attrs();
@ -2459,9 +2460,9 @@ Status ParallelInit() {
  return SUCCESS;
 }

-void HandleForwardMakeTuple(const std::vector<AnfNodePtr> &all_nodes) {
+void HandleForwardMakeTupleAndMakeList(const std::vector<AnfNodePtr> &all_nodes) {
  for (auto &node : all_nodes) {
-    if (!AnfNodeIsPrimitive(node, MAKE_TUPLE)) {
+    if (!AnfNodeIsPrimitive(node, MAKE_TUPLE) && !AnfNodeIsPrimitive(node, MAKE_LIST)) {
      continue;
    }

@ -2473,25 +2474,28 @@ void HandleForwardMakeTuple(const std::vector<AnfNodePtr> &all_nodes) {

    FuncGraphManagerPtr manager = cnode->func_graph()->manager();
    MS_EXCEPTION_IF_NULL(manager);
-    auto make_tuple_user = manager->node_users()[cnode];
-    if (make_tuple_user.size() != 1) {
-      MS_LOG(EXCEPTION) << "Now the make_tuple's user must be 1, but got " << make_tuple_user.size();
+    std::string op_type = AnfNodeIsPrimitive(node, MAKE_TUPLE) ? MAKE_TUPLE : MAKE_LIST;
+
+    auto make_tuple_list_user = manager->node_users()[cnode];
+    if (make_tuple_list_user.size() != 1) {
+      MS_LOG(EXCEPTION) << "Now the " << op_type << "'s user must be 1, but got " << make_tuple_list_user.size();
    }
-    CNodePtr make_tuple_next_cnode = make_tuple_user.pop().first->cast<CNodePtr>();
-    MS_EXCEPTION_IF_NULL(make_tuple_next_cnode);
+    CNodePtr make_tuple_list_next_cnode = make_tuple_list_user.pop().first->cast<CNodePtr>();
+    MS_EXCEPTION_IF_NULL(make_tuple_list_next_cnode);

-    std::string make_tuple_user_prim_name = GetPrimName(make_tuple_next_cnode);
-    if (!IsParallelCareNode(make_tuple_next_cnode)) {
-      MS_LOG(INFO) << "The make_tuple's user is " << make_tuple_user_prim_name << ", no need to set operator info";
+    std::string make_tuple__list_user_prim_name = GetPrimName(make_tuple_list_next_cnode);
+    if (!IsParallelCareNode(make_tuple_list_next_cnode)) {
+      MS_LOG(INFO) << "The " << op_type << "'s user is " << make_tuple__list_user_prim_name
+                   << ", no need to set operator info";
      continue;
    }
-    if (make_tuple_next_cnode->inputs().size() != 2) {
-      MS_LOG(EXCEPTION) << "Now the make_tuple's user only support 1 input, but got "
-                        << make_tuple_next_cnode->inputs().size() - 1;
+    if (make_tuple_list_next_cnode->inputs().size() != 2) {
+      MS_LOG(EXCEPTION) << "Now the " << op_type << "'s user only support 1 input, but got "
+                        << make_tuple_list_next_cnode->inputs().size() - 1;
    }

-    MS_LOG(INFO) << "Set the make_tuple's operator info, and the op name is " << make_tuple_user_prim_name;
-    OperatorInfoPtr op_info = GetDistributeOperator(make_tuple_next_cnode);
+    MS_LOG(INFO) << "Set the " << op_type << "'s operator info, and the op name is " << make_tuple__list_user_prim_name;
+    OperatorInfoPtr op_info = GetDistributeOperator(make_tuple_list_next_cnode);
    MS_EXCEPTION_IF_NULL(op_info);
    cnode->set_user_data<OperatorInfo>(op_info);
  }
@ -2695,7 +2699,7 @@ bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer)
    ReshapeInit(all_nodes);
  }

-  HandleForwardMakeTuple(all_nodes);
+  HandleForwardMakeTupleAndMakeList(all_nodes);

  // if the input or parameter has multiple users, check whether its split strategies are consistent.
  CheckParameterSplit(all_nodes);
--- a/mindspore/ccsrc/utils/convert_utils_py.cc
+++ b/mindspore/ccsrc/utils/convert_utils_py.cc
@ -348,6 +348,16 @@ AbstractBasePtr PyListDtype2AbstractTensor(const py::object &shape_obj, const py
    }
    auto tuple = std::make_shared<abstract::AbstractTuple>(ptr_list);
    return tuple;
+  } else if (py::isinstance<py::list>(shape_obj) && py::isinstance<py::list>(type_obj)) {
+    py::list shape_list = shape_obj.cast<py::list>();
+    py::list typeid_list = type_obj.cast<py::list>();
+    AbstractBasePtrList ptr_list;
+    for (size_t it = 0; it < shape_list.size(); ++it) {
+      auto tensor_it = PyListDtype2AbstractTensor(shape_list[it], typeid_list[it]);
+      ptr_list.push_back(tensor_it);
+    }
+    auto list = std::make_shared<abstract::AbstractList>(ptr_list);
+    return list;
  } else if (shape_obj.is_none() && type_obj.is_none()) {
    // AbstractNone indicates there is no output for this CNode node.
    auto abstract_none = std::make_shared<abstract::AbstractNone>();
--- a/mindspore/ops/_grad/grad_comm_ops.py
+++ b/mindspore/ops/_grad/grad_comm_ops.py
@ -228,11 +228,19 @@ def get_bprop_virtual_div_operator(self):
            dx = op(dout, cast(F.scalar_to_array(divisor), dtype(dout)))
            return (dx,)

-        dx = ()
-        input_nums = F.tuple_len(dout)
+        if F.issubclass_(F.typeof(dout), mstype.tuple_):
+            dx = ()
+            input_nums = F.tuple_len(dout)
+            for i in range(input_nums):
+                ele_grad = op(dout[i], cast(F.scalar_to_array(divisor), dtype(dout[i])))
+                dx = dx + (ele_grad,)
+            return (dx,)
+
+        dx = []
+        input_nums = F.list_len(dout)
        for i in range(input_nums):
            ele_grad = op(dout[i], cast(F.scalar_to_array(divisor), dtype(dout[i])))
-            dx = dx + (ele_grad,)
+            dx.append(ele_grad)
        return (dx,)
    return bprop

--- a/mindspore/ops/functional.py
+++ b/mindspore/ops/functional.py
@ -92,6 +92,7 @@ dict_getitem = Primitive('dict_getitem')
 dict_setitem = Primitive('dict_setitem')
 tuple_div = Primitive("tuple_div")
 tuple_len = Primitive("tuple_len")
+list_len = Primitive("list_len")
 tuple_reversed = Primitive("tuple_reversed")
 make_range = Primitive("make_range")
 make_tuple = Primitive('make_tuple')
--- a/tests/ut/python/parallel/test_pack.py
+++ b/tests/ut/python/parallel/test_pack.py
@ -0,0 +1,188 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import numpy as np
+import mindspore as ms
+import mindspore.context as context
+from mindspore import Tensor, Parameter
+import mindspore.nn as nn
+from mindspore.common.api import _executor
+from mindspore.nn import TrainOneStepCell, Momentum
+from mindspore.ops import operations as P
+
+
+class Net(nn.Cell):
+    """Packs two weights along `axis` and multiplies the packed tensor with the input.
+
+    `strategy1` is applied to Pack and `strategy2` to Mul via `shard`. When
+    `is_parameter` is False, `weight1` is stored as passed instead of being
+    wrapped in a Parameter named "w1"; `weight2` is always a Parameter.
+    """
+
+    def __init__(self, weight1, weight2, axis=0, strategy1=None, strategy2=None, is_parameter=True):
+        super().__init__()
+        self.pack = P.Pack(axis=axis).shard(strategy1)
+        self.mul = P.Mul().shard(strategy2)
+        self.weight1 = Parameter(weight1, "w1") if is_parameter else weight1
+        self.weight2 = Parameter(weight2, "w2")
+
+    def construct(self, x):
+        packed = self.pack([self.weight1, self.weight2])
+        return self.mul(x, packed)
+
+
+class Net1(nn.Cell):
+    """Multiplies the input with `weight1`, then packs the product with `weight2` along `axis`.
+
+    `strategy1` is applied to Pack and `strategy2` to Mul via `shard`. Both
+    weights are wrapped in Parameters named "w1" and "w2".
+    """
+
+    def __init__(self, weight1, weight2, axis=0, strategy1=None, strategy2=None):
+        super().__init__()
+        self.pack = P.Pack(axis=axis).shard(strategy1)
+        self.mul = P.Mul().shard(strategy2)
+        self.weight1 = Parameter(weight1, "w1")
+        self.weight2 = Parameter(weight2, "w2")
+
+    def construct(self, x):
+        product = self.mul(x, self.weight1)
+        return self.pack([product, self.weight2])
+
+
+class Net2(nn.Cell):
+    """Packs three weights along `axis` and multiplies the packed tensor with the input.
+
+    `strategy1` is applied to Pack and `strategy2` to Mul via `shard`. When
+    `is_parameter` is False, `weight1` is stored as passed instead of being
+    wrapped in a Parameter named "w1"; `weight2` and `weight3` are always
+    Parameters.
+    """
+
+    def __init__(self, weight1, weight2, weight3, axis=0, strategy1=None, strategy2=None, is_parameter=True):
+        super(Net2, self).__init__()
+        self.pack = P.Pack(axis=axis).shard(strategy1)
+        self.mul = P.Mul().shard(strategy2)
+        if is_parameter:
+            self.weight1 = Parameter(weight1, "w1")
+        else:
+            self.weight1 = weight1
+        self.weight2 = Parameter(weight2, "w2")
+        # "w3" must wrap its own tensor; it previously reused weight2, so the
+        # weight3 argument was silently ignored.
+        self.weight3 = Parameter(weight3, "w3")
+
+    def construct(self, x):
+        out = self.pack([self.weight1, self.weight2, self.weight3])
+        out = self.mul(x, out)
+        return out
+
+
+_w1 = Tensor(np.ones([48, 64]), dtype=ms.float32)
+_w2 = Tensor(np.ones([48, 64]), dtype=ms.float32)
+_w3 = Tensor(np.ones([48, 64]), dtype=ms.float32)
+_x = Tensor(np.ones([2, 48, 64]), dtype=ms.float32)
+_x1 = Tensor(np.ones([48, 64]), dtype=ms.float32)
+_x2 = Tensor(np.ones([3, 48, 64]), dtype=ms.float32)
+
+
+def _compile_with_input(net, inputs):
+    """Wrap `net` in a Momentum-driven TrainOneStepCell and compile it in graph mode with `inputs`.
+
+    The auto-parallel context is reset after compilation, so each test must
+    configure it again before calling this helper.
+    """
+    context.set_context(mode=context.GRAPH_MODE, save_graphs=True)
+    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
+    train_net = TrainOneStepCell(net, optimizer)
+    train_net.set_auto_parallel()
+    _executor.compile(train_net, inputs)
+    context.reset_auto_parallel_context()
+
+
+def compile_net(net):
+    """Compile `net` for training with the [2, 48, 64] input `_x`."""
+    _compile_with_input(net, _x)
+
+
+def compile_net1(net):
+    """Compile `net` for training with the [48, 64] input `_x1`."""
+    _compile_with_input(net, _x1)
+
+
+def compile_net2(net):
+    """Compile `net` for training with the [3, 48, 64] input `_x2`."""
+    _compile_with_input(net, _x2)
+
+
+def test_pack_parameter():
+    """Semi-auto-parallel, 8 devices: Net with axis 0, both weights as Parameters, Pack sharded (4, 2)."""
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    pack_strategy = ((4, 2), (4, 2))
+    mul_strategy = ((1, 4, 2), (1, 4, 2))
+    compile_net(Net(_w1, _w2, 0, pack_strategy, mul_strategy))
+
+
+def test_pack_parameter_no_full_split():
+    """Semi-auto-parallel, 8 devices: Net with axis 0 and Pack sharded (2, 2), i.e. over 4 of the 8 devices."""
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    pack_strategy = ((2, 2), (2, 2))
+    mul_strategy = ((1, 4, 2), (1, 4, 2))
+    compile_net(Net(_w1, _w2, 0, pack_strategy, mul_strategy))
+
+
+def test_pack_tensor_and_parameter():
+    """Semi-auto-parallel, 8 devices: Net with axis 0 where weight1 stays a plain Tensor (is_parameter=False)."""
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    pack_strategy = ((4, 2), (4, 2))
+    mul_strategy = ((1, 4, 2), (1, 4, 2))
+    compile_net(Net(_w1, _w2, 0, pack_strategy, mul_strategy, False))
+
+
+def test_pack_output():
+    """Semi-auto-parallel, 8 devices: Net1 (Mul output fed into Pack) with axis 0 and Pack sharded (4, 2)."""
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    pack_strategy = ((4, 2), (4, 2))
+    mul_strategy = ((4, 2), (4, 2))
+    compile_net1(Net1(_w1, _w2, 0, pack_strategy, mul_strategy))
+
+
+def test_pack_output_axis1():
+    """Semi-auto-parallel, 8 devices: Net1 (Mul output fed into Pack) with axis 1 and Pack sharded (4, 2)."""
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    pack_strategy = ((4, 2), (4, 2))
+    mul_strategy = ((4, 2), (4, 2))
+    compile_net1(Net1(_w1, _w2, 1, pack_strategy, mul_strategy))
+
+
+def test_pack_output_no_full_split():
+    """Semi-auto-parallel, 8 devices: Net1 with axis 0 and Pack sharded (2, 2), i.e. over 4 of the 8 devices."""
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    pack_strategy = ((2, 2), (2, 2))
+    mul_strategy = ((4, 2), (4, 2))
+    compile_net1(Net1(_w1, _w2, 0, pack_strategy, mul_strategy))
+
+
+def test_pack_no_strategy():
+    """Semi-auto-parallel, 8 devices: Net1 with axis 0 where Pack gets no strategy (None) and Mul is sharded."""
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    pack_strategy = None
+    mul_strategy = ((4, 2), (4, 2))
+    compile_net1(Net1(_w1, _w2, 0, pack_strategy, mul_strategy))
+
+
+def test_pack_no_strategy_axis1():
+    """Semi-auto-parallel, 8 devices: Net1 with axis 1 where Pack gets no strategy (None) and Mul is sharded."""
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    pack_strategy = None
+    mul_strategy = ((4, 2), (4, 2))
+    compile_net1(Net1(_w1, _w2, 1, pack_strategy, mul_strategy))
+
+
+def test_pack_auto_parallel():
+    """Auto-parallel, 8 devices: Net1 with axis 0 and no user-supplied strategies."""
+    context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=8, global_rank=0)
+    compile_net1(Net1(_w1, _w2, axis=0))
+
+
+def test_pack_auto_parallel_axis1():
+    """Auto-parallel, 8 devices: Net1 with axis 1 and no user-supplied strategies."""
+    context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=8, global_rank=0)
+    compile_net1(Net1(_w1, _w2, axis=1))
+
+
+def test_pack_auto_parallel_3_tensor():
+    """Auto-parallel, 8 devices: Net2 packing three weights with default axis and no user-supplied strategies."""
+    context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=8, global_rank=0)
+    three_way_net = Net2(_w1, _w2, _w3)
+    compile_net2(three_way_net)