diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt
index 36fd19391b..d10b95427f 100644
--- a/mindspore/lite/src/CMakeLists.txt
+++ b/mindspore/lite/src/CMakeLists.txt
@@ -40,7 +40,7 @@ set(LITE_SRC
 if (SUPPORT_GPU)
     set(LITE_SRC
             ${LITE_SRC}
-            ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/subgraph_opencl_kernel.cc
+            ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/opencl_subgraph.cc
             ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/utils.cc
             ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_executor.cc
             ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_allocator.cc
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl
index 215bac2afd..c3dc183323 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl
@@ -1,7 +1,10 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
+#define C4NUM 4
 
-#define CHECK_IDXConcat2input_NHWC4                                                         \
+// Align in Axis C for concat
+#define CHECK_IDX                                                                           \
   int X = get_global_id(0);                                                                 \
   int Y = get_global_id(1);                                                                 \
   int Z = get_global_id(2);                                                                 \
@@ -10,536 +13,318 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE |
   }                                                                                         \
   FLT4 result;
 
-#define DOConcat2inputaxis1_NHWC4                                                                \
-  if (X < input_shape0.y) {                                                                      \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                  \
-  } else {                                                                                       \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y))); \
-  }                                                                                              \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
-
-#define DOConcat2inputaxis2_NHWC4                                                                  \
-  if (Y < input_shape0.z) {                                                                        \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                    \
-  } else {                                                                                         \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X))); \
-  }                                                                                                \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
-
-#define DOConcat2inputaxis3_NHWC4                                                                \
-  if (Z < input_shape0.w) {                                                                      \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                  \
-  } else {                                                                                       \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X))); \
-  }                                                                                              \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
-
-#define CHECK_IDXConcat2input_NC4HW4                                                                                \
-  int X = get_global_id(0);                                                                                         \
-  int Y = get_global_id(1);                                                                                         \
-  int Z = get_global_id(2);                                                                                         \
-  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {                         \
-    return;                                                                                                         \
-  }                                                                                                                 \
-  if (input_shape0.y == 0 || input_shape1.y == 0 || output_shape.y == 0) {                                          \
-    return;                                                                                                         \
-  }                                                                                                                 \
-  int in_postion_x;                                                                                                 \
-  int out_pos_x = (X / output_shape.y) * output_shape.w * output_shape.y + Z * output_shape.y + X % output_shape.y; \
-  FLT4 result;
-
-#define DOConcat2inputaxis1_NC4HW4                                                                                   \
-  if (X < input_shape0.y) {                                                                                          \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else {                                                                                                           \
-    in_postion_x = ((X - input_shape0.y) / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y +  \
-                   ((X - input_shape0.y) % input_shape1.y);                                                          \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x));                                                \
-  }                                                                                                                  \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
-
-#define DOConcat2inputaxis2_NC4HW4                                                                                     \
-  if (Y < input_shape0.z) {                                                                                            \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y;   \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                  \
-  } else {                                                                                                             \
-    in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y + (X % input_shape1.y); \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z), in_postion_x));                                 \
-  }                                                                                                                    \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
-
-#define DOConcat2inputaxis3_NC4HW4                                                                                   \
-  if (Z < input_shape0.w) {                                                                                          \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else {                                                                                                           \
-    in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + (Z - input_shape0.w) * input_shape1.y +  \
-                   (X % input_shape1.y);                                                                             \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x));                                                \
-  }                                                                                                                  \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
-
-#define CHECK_IDXConcat3input_NC4HW4                                                                                \
-  int X = get_global_id(0);                                                                                         \
-  int Y = get_global_id(1);                                                                                         \
-  int Z = get_global_id(2);                                                                                         \
-  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {                         \
-    return;                                                                                                         \
-  }                                                                                                                 \
-  if (input_shape0.y == 0 || input_shape1.y == 0 || input_shape2.y == 0 || output_shape.y == 0) {                   \
-    return;                                                                                                         \
-  }                                                                                                                 \
-  int in_postion_x;                                                                                                 \
-  int out_pos_x = (X / output_shape.y) * output_shape.w * output_shape.y + Z * output_shape.y + X % output_shape.y; \
-  FLT4 result;
-
-#define DOConcat3inputaxis1_NC4HW4                                                                                   \
-  if (X < input_shape0.y) {                                                                                          \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (X < input_shape0.y + input_shape1.y) {                                                                  \
-    in_postion_x = ((X - input_shape0.y) / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y +  \
-                   ((X - input_shape0.y) % input_shape1.y);                                                          \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else {                                                                                                           \
-    in_postion_x = ((X - input_shape0.y - input_shape1.y) / input_shape2.y) * input_shape2.w * input_shape2.y +      \
-                   Z * input_shape2.y + ((X - input_shape0.y - input_shape1.y) % input_shape2.y);                    \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y), in_postion_x));                                                \
-  }                                                                                                                  \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
-
-#define DOConcat3inputaxis2_NC4HW4                                                                                     \
-  if (Y < input_shape0.z) {                                                                                            \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y;   \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                  \
-  } else if (Y < input_shape0.z + input_shape1.z) {                                                                    \
-    in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y + (X % input_shape1.y); \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z), in_postion_x));                                 \
-  } else {                                                                                                             \
-    in_postion_x = (X / input_shape2.y) * input_shape2.w * input_shape2.y + Z * input_shape2.y + (X % input_shape2.y); \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z), in_postion_x));                \
-  }                                                                                                                    \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
-
-#define DOConcat3inputaxis3_NC4HW4                                                                                   \
-  if (Z < input_shape0.w) {                                                                                          \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (Z < input_shape0.w + input_shape1.w) {                                                                  \
-    in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + (Z - input_shape0.w) * input_shape1.y +  \
-                   (X % input_shape1.y);                                                                             \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else {                                                                                                           \
-    in_postion_x = (X / input_shape2.y) * input_shape2.w * input_shape2.y +                                          \
-                   (Z - input_shape0.w - input_shape1.w) * input_shape2.y + (X % input_shape2.y);                    \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y), in_postion_x));                                                \
-  }                                                                                                                  \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
-
-#define CHECK_IDXConcat3input_NHWC4                                                         \
-  int X = get_global_id(0);                                                                 \
-  int Y = get_global_id(1);                                                                 \
-  int Z = get_global_id(2);                                                                 \
-  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { \
-    return;                                                                                 \
-  }                                                                                         \
-  FLT4 result;
-
-#define DOConcat3inputaxis1_NHWC4                                                                                 \
-  if (X < input_shape0.y) {                                                                                       \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                                   \
-  } else if (X < (input_shape0.y + input_shape1.y)) {                                                             \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y)));                  \
-  } else {                                                                                                        \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z, (X - input_shape0.y - input_shape1.y))); \
-  }                                                                                                               \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
-
-#define DOConcat3inputaxis2_NHWC4                                                                                   \
-  if (Y < input_shape0.z) {                                                                                         \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                                     \
-  } else if (Y < (input_shape0.z + input_shape1.z)) {                                                               \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X)));                  \
-  } else {                                                                                                          \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z) * input_shape2.w + Z, (X))); \
-  }                                                                                                                 \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
-
-#define DOConcat3inputaxis3_NHWC4                                                                                 \
-  if (Z < input_shape0.w) {                                                                                       \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                                   \
-  } else if (Z < (input_shape0.w + input_shape1.w)) {                                                             \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X)));                  \
-  } else {                                                                                                        \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z - input_shape0.w - input_shape1.w, (X))); \
-  }                                                                                                               \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
-
-#define CHECK_IDXConcat4input_NHWC4                                                         \
-  int X = get_global_id(0);                                                                 \
-  int Y = get_global_id(1);                                                                 \
-  int Z = get_global_id(2);                                                                 \
-  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { \
-    return;                                                                                 \
-  }                                                                                         \
-  FLT4 result;
-
-#define DOConcat4inputaxis1_NHWC4                                                                                 \
-  if (X < input_shape0.y) {                                                                                       \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                                   \
-  } else if (X < (input_shape0.y + input_shape1.y)) {                                                             \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y)));                  \
-  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y)) {                                            \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z, (X - input_shape0.y - input_shape1.y))); \
-  } else {                                                                                                        \
-    result = READ_IMAGE(input3, smp_none,                                                                         \
-                        (int2)((Y)*input_shape3.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y)));  \
-  }                                                                                                               \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
-
-#define DOConcat4inputaxis2_NHWC4                                                                                   \
-  if (Y < input_shape0.z) {                                                                                         \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                                     \
-  } else if (Y < (input_shape0.z + input_shape1.z)) {                                                               \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X)));                  \
-  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z)) {                                              \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z) * input_shape2.w + Z, (X))); \
-  } else {                                                                                                          \
-    result = READ_IMAGE(input3, smp_none,                                                                           \
-                        (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z) * input_shape3.w + Z, (X)));  \
-  }                                                                                                                 \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
+// axis = 1
+#define DOConcat2inputaxis1_NHWC4                                              \
+  int IN = X / output_shape.y;                                                 \
+  int IH = X % output_shape.y;                                                 \
+  int boundary0 = input_shape0.y;                                              \
+  int boundary1 = boundary0 + input_shape1.y;                                  \
+  if (IH < boundary0) {                                                        \
+    int coordinate_x = Y * input_shape0.w + Z;                                 \
+    int coordinate_y = IN * input_shape0.y + IH;                               \
+    result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  } else if (IH < boundary1) {                                                 \
+    int coordinate_x = Y * input_shape1.w + Z;                                 \
+    int coordinate_y = IN * input_shape1.y + IH - boundary0;                   \
+    result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define DOConcat4inputaxis3_NHWC4                                                                                 \
-  if (Z < input_shape0.w) {                                                                                       \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                                   \
-  } else if (Z < (input_shape0.w + input_shape1.w)) {                                                             \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X)));                  \
-  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w)) {                                            \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z - input_shape0.w - input_shape1.w, (X))); \
-  } else {                                                                                                        \
-    result = READ_IMAGE(input3, smp_none,                                                                         \
-                        (int2)((Y)*input_shape3.w + Z - input_shape0.w - input_shape1.w - input_shape2.w, (X)));  \
-  }                                                                                                               \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
+#define DOConcat3inputaxis1_NHWC4                                              \
+  DOConcat2inputaxis1_NHWC4;                                                   \
+  int boundary2 = boundary1 + input_shape2.y;                                  \
+  if (IH >= boundary1 && IH < boundary2) {                                     \
+    int coordinate_x = Y * input_shape2.w + Z;                                 \
+    int coordinate_y = IN * input_shape2.y + IH - boundary1;                   \
+    result = READ_IMAGE(input2, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define CHECK_IDXConcat4input_NC4HW4                                                                                \
-  int X = get_global_id(0);                                                                                         \
-  int Y = get_global_id(1);                                                                                         \
-  int Z = get_global_id(2);                                                                                         \
-  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {                         \
-    return;                                                                                                         \
-  }                                                                                                                 \
-  if (input_shape0.y == 0 || input_shape1.y == 0 || input_shape2.y == 0 || input_shape3.y == 0 ||                   \
-      output_shape.y == 0) {                                                                                        \
-    return;                                                                                                         \
-  }                                                                                                                 \
-  int in_postion_x;                                                                                                 \
-  int out_pos_x = (X / output_shape.y) * output_shape.w * output_shape.y + Z * output_shape.y + X % output_shape.y; \
-  FLT4 result;
+#define DOConcat4inputaxis1_NHWC4                                              \
+  DOConcat3inputaxis1_NHWC4;                                                   \
+  int boundary3 = boundary2 + input_shape3.y;                                  \
+  if (IH >= boundary2 && IH < boundary3) {                                     \
+    int coordinate_x = Y * input_shape3.w + Z;                                 \
+    int coordinate_y = IN * input_shape3.y + IH - boundary2;                   \
+    result = READ_IMAGE(input3, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define DOConcat4inputaxis1_NC4HW4                                                                                   \
-  if (X < input_shape0.y) {                                                                                          \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (X < input_shape0.y + input_shape1.y) {                                                                  \
-    in_postion_x = ((X - input_shape0.y) / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y +  \
-                   ((X - input_shape0.y) % input_shape1.y);                                                          \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (X < input_shape0.y + input_shape1.y + input_shape2.y) {                                                 \
-    in_postion_x = ((X - input_shape0.y - input_shape1.y) / input_shape2.y) * input_shape2.w * input_shape2.y +      \
-                   Z * input_shape2.y + ((X - input_shape0.y - input_shape1.y) % input_shape2.y);                    \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else {                                                                                                           \
-    in_postion_x =                                                                                                   \
-      ((X - input_shape0.y - input_shape1.y - input_shape2.y) / input_shape3.y) * input_shape3.w * input_shape3.y +  \
-      Z * input_shape3.y + ((X - input_shape0.y - input_shape1.y - input_shape2.y) % input_shape3.y);                \
-    result = READ_IMAGE(input3, smp_none, (int2)((Y), in_postion_x));                                                \
-  }                                                                                                                  \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
+#define DOConcat5inputaxis1_NHWC4                                              \
+  DOConcat4inputaxis1_NHWC4;                                                   \
+  int boundary4 = boundary3 + input_shape4.y;                                  \
+  if (IH >= boundary3 && IH < boundary4) {                                     \
+    int coordinate_x = Y * input_shape4.w + Z;                                 \
+    int coordinate_y = IN * input_shape4.y + IH - boundary3;                   \
+    result = READ_IMAGE(input4, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define DOConcat4inputaxis2_NC4HW4                                                                                     \
-  if (Y < input_shape0.z) {                                                                                            \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y;   \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                  \
-  } else if (Y < input_shape0.z + input_shape1.z) {                                                                    \
-    in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y + (X % input_shape1.y); \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z), in_postion_x));                                 \
-  } else if (Y < input_shape0.z + input_shape1.z + input_shape2.z) {                                                   \
-    in_postion_x = (X / input_shape2.y) * input_shape2.w * input_shape2.y + Z * input_shape2.y + (X % input_shape2.y); \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z), in_postion_x));                \
-  } else {                                                                                                             \
-    in_postion_x = (X / input_shape3.y) * input_shape3.w * input_shape3.y + Z * input_shape3.y + (X % input_shape3.y); \
-    result =                                                                                                           \
-      READ_IMAGE(input3, smp_none, (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z), in_postion_x));      \
-  }                                                                                                                    \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
+#define DOConcat6inputaxis1_NHWC4                                              \
+  DOConcat5inputaxis1_NHWC4;                                                   \
+  int boundary5 = boundary4 + input_shape5.y;                                  \
+  if (IH >= boundary4 && IH < boundary5) {                                     \
+    int coordinate_x = Y * input_shape5.w + Z;                                 \
+    int coordinate_y = IN * input_shape5.y + IH - boundary4;                   \
+    result = READ_IMAGE(input5, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define DOConcat4inputaxis3_NC4HW4                                                                                   \
-  if (Z < input_shape0.w) {                                                                                          \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (Z < input_shape0.w + input_shape1.w) {                                                                  \
-    in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + (Z - input_shape0.w) * input_shape1.y +  \
-                   (X % input_shape1.y);                                                                             \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (Z < input_shape0.w + input_shape1.w + input_shape2.w) {                                                 \
-    in_postion_x = (X / input_shape2.y) * input_shape2.w * input_shape2.y +                                          \
-                   (Z - input_shape0.w - input_shape1.w) * input_shape2.y + (X % input_shape2.y);                    \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else {                                                                                                           \
-    in_postion_x = (X / input_shape3.y) * input_shape3.w * input_shape3.y +                                          \
-                   (Z - input_shape0.w - input_shape1.w - input_shape2.w) * input_shape3.y + (X % input_shape3.y);   \
-    result = READ_IMAGE(input3, smp_none, (int2)((Y), in_postion_x));                                                \
-  }                                                                                                                  \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
+// axis = 2
+#define DOConcat2inputaxis2_NHWC4                                              \
+  int boundary0 = input_shape0.z;                                              \
+  int boundary1 = boundary0 + input_shape1.z;                                  \
+  if (Y < boundary0) {                                                         \
+    int coordinate_x = Y * input_shape0.w + Z;                                 \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  } else {                                                                     \
+    int coordinate_x = (Y - boundary0) * input_shape1.w + Z;                   \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-__kernel void Concat4input_NC4HW4(__read_only image2d_t input0, __read_only image2d_t input1,
-                                  __read_only image2d_t input2, __read_only image2d_t input3,
-                                  __write_only image2d_t output, int4 input_shape0, int4 input_shape1,
-                                  int4 input_shape2, int4 input_shape3, int4 output_shape, const int axis) {}
+#define DOConcat3inputaxis2_NHWC4                                              \
+  DOConcat2inputaxis2_NHWC4;                                                   \
+  int boundary2 = boundary1 + input_shape2.z;                                  \
+  if (Y >= boundary1 && Y < boundary2) {                                       \
+    int coordinate_x = (Y - boundary1) * input_shape2.w + Z;                   \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input2, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define CHECK_IDXConcat6input_NHWC4                                                         \
-  int X = get_global_id(0);                                                                 \
-  int Y = get_global_id(1);                                                                 \
-  int Z = get_global_id(2);                                                                 \
-  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { \
-    return;                                                                                 \
-  }                                                                                         \
-  FLT4 result;
+#define DOConcat4inputaxis2_NHWC4                                              \
+  DOConcat3inputaxis2_NHWC4;                                                   \
+  int boundary3 = boundary2 + input_shape3.z;                                  \
+  if (Y >= boundary2 && Y < boundary3) {                                       \
+    int coordinate_x = (Y - boundary2) * input_shape3.w + Z;                   \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input3, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define DOConcat6inputaxis1_NHWC4                                                                                 \
-  if (X < input_shape0.y) {                                                                                       \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                                   \
-  } else if (X < (input_shape0.y + input_shape1.y)) {                                                             \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y)));                  \
-  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y)) {                                            \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z, (X - input_shape0.y - input_shape1.y))); \
-  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y)) {                           \
-    result = READ_IMAGE(input3, smp_none,                                                                         \
-                        (int2)((Y)*input_shape3.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y)));  \
-  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y)) {          \
-    result = READ_IMAGE(                                                                                          \
-      input4, smp_none,                                                                                           \
-      (int2)((Y)*input_shape4.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y)));   \
-  } else {                                                                                                        \
-    result = READ_IMAGE(input5, smp_none,                                                                         \
-                        (int2)((Y)*input_shape5.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y -    \
-                                                        input_shape3.y - input_shape4.y)));                       \
-  }                                                                                                               \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
+#define DOConcat5inputaxis2_NHWC4                                              \
+  DOConcat4inputaxis2_NHWC4;                                                   \
+  int boundary4 = boundary3 + input_shape4.z;                                  \
+  if (Y >= boundary3 && Y < boundary4) {                                       \
+    int coordinate_x = (Y - boundary3) * input_shape4.w + Z;                   \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input4, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define DOConcat6inputaxis2_NHWC4                                                                                      \
-  if (Y < input_shape0.z) {                                                                                            \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                                        \
-  } else if (Y < (input_shape0.z + input_shape1.z)) {                                                                  \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X)));                     \
-  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z)) {                                                 \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z) * input_shape2.w + Z, (X)));    \
-  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z)) {                                \
-    result = READ_IMAGE(input3, smp_none,                                                                              \
-                        (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z) * input_shape3.w + Z, (X)));     \
-  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z)) {               \
-    result = READ_IMAGE(                                                                                               \
-      input4, smp_none,                                                                                                \
-      (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z) * input_shape4.w + Z, (X)));      \
-  } else {                                                                                                             \
-    result = READ_IMAGE(                                                                                               \
-      input5, smp_none,                                                                                                \
-      (int2)(                                                                                                          \
-        (Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z) * input_shape5.w + Z, \
-        (X)));                                                                                                         \
-  }                                                                                                                    \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
+#define DOConcat6inputaxis2_NHWC4                                              \
+  DOConcat5inputaxis2_NHWC4;                                                   \
+  int boundary5 = boundary4 + input_shape5.z;                                  \
+  if (Y >= boundary4 && Y < boundary5) {                                       \
+    int coordinate_x = (Y - boundary4) * input_shape5.w + Z;                   \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input5, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define DOConcat6inputaxis3_NHWC4                                                                                 \
-  if (Z < input_shape0.w) {                                                                                       \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));                                   \
-  } else if (Z < (input_shape0.w + input_shape1.w)) {                                                             \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X)));                  \
-  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w)) {                                            \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z - input_shape0.w - input_shape1.w, (X))); \
-  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w)) {                           \
-    result = READ_IMAGE(input3, smp_none,                                                                         \
-                        (int2)((Y)*input_shape3.w + Z - input_shape0.w - input_shape1.w - input_shape2.w, (X)));  \
-  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w)) {          \
-    result = READ_IMAGE(                                                                                          \
-      input4, smp_none,                                                                                           \
-      (int2)((Y)*input_shape4.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - input_shape3.w, (X)));   \
-  } else {                                                                                                        \
-    result = READ_IMAGE(input5, smp_none,                                                                         \
-                        (int2)((Y)*input_shape5.w + Z - input_shape0.w - input_shape1.w - input_shape2.w -        \
-                                 input_shape3.w - input_shape4.w,                                                 \
-                               (X)));                                                                             \
-  }                                                                                                               \
-  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
+// axis = 3
+#define DOConcat2inputaxis3_NHWC4                                              \
+  int boundary0 = input_shape0.w;                                              \
+  int boundary1 = boundary0 + input_shape1.w;                                  \
+  if (Z < boundary0) {                                                         \
+    int coordinate_x = Y * input_shape0.w + Z;                                 \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  } else {                                                                     \
+    int coordinate_x = Y * input_shape1.w + Z - boundary0;                     \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define CHECK_IDXConcat6input_NC4HW4                                                              \
-  int X = get_global_id(0);                                                                       \
-  int Y = get_global_id(1);                                                                       \
-  int Z = get_global_id(2);                                                                       \
-  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {       \
-    return;                                                                                       \
-  }                                                                                               \
-  if (input_shape0.y == 0 || input_shape1.y == 0 || input_shape2.y == 0 || input_shape3.y == 0 || \
-      input_shape4.y == 0 || input_shape5.y == 0 || output_shape.y == 0) {                        \
-    return;                                                                                       \
-  }                                                                                               \
-  int in_postion_x;                                                                               \
-  FLT4 result;                                                                                    \
-  int out_pos_x = (X / output_shape.y) * output_shape.w * output_shape.y + Z * output_shape.y + X % output_shape.y;
+#define DOConcat3inputaxis3_NHWC4                                              \
+  DOConcat2inputaxis3_NHWC4;                                                   \
+  int boundary2 = boundary1 + input_shape2.w;                                  \
+  if (Z >= boundary1 && Z < boundary2) {                                       \
+    int coordinate_x = Y * input_shape2.w + Z - boundary1;                     \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input2, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define DOConcat6inputaxis1_NC4HW4                                                                                   \
-  if (X < input_shape0.y) {                                                                                          \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (X < (input_shape0.y + input_shape1.y)) {                                                                \
-    in_postion_x = ((X - input_shape0.y) / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y +  \
-                   ((X - input_shape0.y) % input_shape1.y);                                                          \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y)) {                                               \
-    in_postion_x = ((X - input_shape0.y - input_shape1.y) / input_shape2.y) * input_shape2.w * input_shape2.y +      \
-                   Z * input_shape2.y + ((X - input_shape0.y - input_shape1.y) % input_shape2.y);                    \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y)) {                              \
-    in_postion_x =                                                                                                   \
-      ((X - input_shape0.y - input_shape1.y - input_shape2.y) / input_shape3.y) * input_shape3.w * input_shape3.y +  \
-      Z * input_shape3.y + ((X - input_shape0.y - input_shape1.y - input_shape2.y) % input_shape3.y);                \
-    result = READ_IMAGE(input3, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y)) {             \
-    in_postion_x = ((X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y) / input_shape4.y) *      \
-                     input_shape4.w * input_shape4.y +                                                               \
-                   Z * input_shape4.y +                                                                              \
-                   ((X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y) % input_shape4.y);       \
-    result = READ_IMAGE(input4, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else {                                                                                                           \
-    in_postion_x =                                                                                                   \
-      ((X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y - input_shape4.y) / input_shape5.y) *  \
-        input_shape5.w * input_shape5.y +                                                                            \
-      Z * input_shape5.y +                                                                                           \
-      ((X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y - input_shape4.y) % input_shape5.y);   \
-    result = READ_IMAGE(input5, smp_none, (int2)((Y), in_postion_x));                                                \
-  }                                                                                                                  \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
+#define DOConcat4inputaxis3_NHWC4                                              \
+  DOConcat3inputaxis3_NHWC4;                                                   \
+  int boundary3 = boundary2 + input_shape3.w;                                  \
+  if (Z >= boundary2 && Z < boundary3) {                                       \
+    int coordinate_x = Y * input_shape3.w + Z - boundary2;                     \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input3, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define DOConcat6inputaxis2_NC4HW4                                                                                     \
-  if (Y < input_shape0.z) {                                                                                            \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y;   \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                  \
-  } else if (Y < (input_shape0.z + input_shape1.z)) {                                                                  \
-    in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y + (X % input_shape1.y); \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z), in_postion_x));                                 \
-  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z)) {                                                 \
-    in_postion_x = (X / input_shape2.y) * input_shape2.w * input_shape2.y + Z * input_shape2.y + (X % input_shape2.y); \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z), in_postion_x));                \
-  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z)) {                                \
-    in_postion_x = (X / input_shape3.y) * input_shape3.w * input_shape3.y + Z * input_shape3.y + (X % input_shape3.y); \
-    result =                                                                                                           \
-      READ_IMAGE(input3, smp_none, (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z), in_postion_x));      \
-  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z)) {               \
-    in_postion_x = (X / input_shape4.y) * input_shape4.w * input_shape4.y + Z * input_shape4.y + (X % input_shape4.y); \
-    result =                                                                                                           \
-      READ_IMAGE(input4, smp_none,                                                                                     \
-                 (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z), in_postion_x));       \
-  } else {                                                                                                             \
-    in_postion_x = (X / input_shape5.y) * input_shape5.w * input_shape5.y + Z * input_shape5.y + (X % input_shape5.y); \
-    result = READ_IMAGE(                                                                                               \
-      input5, smp_none,                                                                                                \
-      (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z), in_postion_x)); \
-  }                                                                                                                    \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
+#define DOConcat5inputaxis3_NHWC4                                              \
+  DOConcat4inputaxis3_NHWC4;                                                   \
+  int boundary4 = boundary3 + input_shape4.w;                                  \
+  if (Z >= boundary3 && Z < boundary4) {                                       \
+    int coordinate_x = Y * input_shape4.w + Z - boundary3;                     \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input4, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
-#define DOConcat6inputaxis3_NC4HW4                                                                                   \
-  if (Z < input_shape0.w) {                                                                                          \
-    in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; \
-    result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (Z < (input_shape0.w + input_shape1.w)) {                                                                \
-    in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + (Z - input_shape0.w) * input_shape1.y +  \
-                   (X % input_shape1.y);                                                                             \
-    result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w)) {                                               \
-    in_postion_x = (X / input_shape2.y) * input_shape2.w * input_shape2.y +                                          \
-                   (Z - input_shape0.w - input_shape1.w) * input_shape2.y + (X % input_shape2.y);                    \
-    result = READ_IMAGE(input2, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w)) {                              \
-    in_postion_x = (X / input_shape3.y) * input_shape3.w * input_shape3.y +                                          \
-                   (Z - input_shape0.w - input_shape1.w - input_shape2.w) * input_shape3.y + (X % input_shape3.y);   \
-    result = READ_IMAGE(input3, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w)) {             \
-    in_postion_x = (X / input_shape4.y) * input_shape4.w * input_shape4.y +                                          \
-                   (Z - input_shape0.w - input_shape1.w - input_shape2.w - input_shape3.w) * input_shape4.y +        \
-                   (X % input_shape4.y);                                                                             \
-    result = READ_IMAGE(input4, smp_none, (int2)((Y), in_postion_x));                                                \
-  } else {                                                                                                           \
-    in_postion_x =                                                                                                   \
-      (X / input_shape5.y) * input_shape5.w * input_shape5.y +                                                       \
-      (Z - input_shape0.w - input_shape1.w - input_shape2.w - input_shape3.w - input_shape4.w) * input_shape5.y +    \
-      (X % input_shape5.y);                                                                                          \
-    result = READ_IMAGE(input5, smp_none, (int2)((Y), in_postion_x));                                                \
-  }                                                                                                                  \
-  WRITE_IMAGE(output, (int2)((Y), out_pos_x), result);
+#define DOConcat6inputaxis3_NHWC4                                              \
+  DOConcat5inputaxis3_NHWC4;                                                   \
+  int boundary5 = boundary4 + input_shape5.w;                                  \
+  if (Z >= boundary4 && Z < boundary5) {                                       \
+    int coordinate_x = Y * input_shape5.w + Z - boundary4;                     \
+    int coordinate_y = X;                                                      \
+    result = READ_IMAGE(input5, smp_none, (int2)(coordinate_x, coordinate_y)); \
+  }
 
 #define CONCAT6(Inputnum, Axis, ToFormat)                                                                      \
   __kernel void Concat##Inputnum##Axis##ToFormat(                                                              \
     __read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,                  \
     __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5,                  \
     __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2, int4 input_shape3, \
-    int4 input_shape4, int4 input_shape5, int4 output_shape, const int axis) {                                 \
-    CHECK_IDXConcat6input##ToFormat;                                                                           \
+    int4 input_shape4, int4 input_shape5, int4 output_shape) {                                                 \
+    CHECK_IDX;                                                                                                 \
     DOConcat##Inputnum##Axis##ToFormat;                                                                        \
+    WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);                                          \
+  }
+
+#define CONCAT5(Inputnum, Axis, ToFormat)                                                                         \
+  __kernel void Concat##Inputnum##Axis##ToFormat(                                                                 \
+    __read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,                     \
+    __read_only image2d_t input3, __read_only image2d_t input4, __write_only image2d_t output, int4 input_shape0, \
+    int4 input_shape1, int4 input_shape2, int4 input_shape3, int4 input_shape4, int4 output_shape) {              \
+    CHECK_IDX;                                                                                                    \
+    DOConcat##Inputnum##Axis##ToFormat;                                                                           \
+    WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);                                             \
   }
 
-#define CONCAT4(Inputnum, Axis, ToFormat)                                                              \
-  __kernel void Concat##Inputnum##Axis##ToFormat(                                                      \
-    __read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,          \
-    __read_only image2d_t input3, __write_only image2d_t output, int4 input_shape0, int4 input_shape1, \
-    int4 input_shape2, int4 input_shape3, int4 output_shape, const int axis) {                         \
-    CHECK_IDXConcat4input##ToFormat;                                                                   \
-    DOConcat##Inputnum##Axis##ToFormat;                                                                \
+#define CONCAT4(Inputnum, Axis, ToFormat)                                                                             \
+  __kernel void Concat##Inputnum##Axis##ToFormat(__read_only image2d_t input0, __read_only image2d_t input1,          \
+                                                 __read_only image2d_t input2, __read_only image2d_t input3,          \
+                                                 __write_only image2d_t output, int4 input_shape0, int4 input_shape1, \
+                                                 int4 input_shape2, int4 input_shape3, int4 output_shape) {           \
+    CHECK_IDX                                                                                                         \
+    DOConcat##Inputnum##Axis##ToFormat;                                                                               \
+    WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);                                                 \
   }
 
-#define CONCAT3(Inputnum, Axis, ToFormat)                                                                     \
-  __kernel void Concat##Inputnum##Axis##ToFormat(__read_only image2d_t input0, __read_only image2d_t input1,  \
-                                                 __read_only image2d_t input2, __write_only image2d_t output, \
-                                                 int4 input_shape0, int4 input_shape1, int4 input_shape2,     \
-                                                 int4 output_shape, const int axis) {                         \
-    CHECK_IDXConcat3input##ToFormat;                                                                          \
-    DOConcat##Inputnum##Axis##ToFormat;                                                                       \
+#define CONCAT3(Inputnum, Axis, ToFormat)                                                                        \
+  __kernel void Concat##Inputnum##Axis##ToFormat(                                                                \
+    __read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,                    \
+    __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2, int4 output_shape) { \
+    CHECK_IDX                                                                                                    \
+    DOConcat##Inputnum##Axis##ToFormat;                                                                          \
+    WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);                                            \
   }
 
 #define CONCAT2(Inputnum, Axis, ToFormat)                                                                             \
   __kernel void Concat##Inputnum##Axis##ToFormat(__read_only image2d_t input0, __read_only image2d_t input1,          \
                                                  __write_only image2d_t output, int4 input_shape0, int4 input_shape1, \
-                                                 int4 output_shape, const int axis) {                                 \
-    CHECK_IDXConcat2input##ToFormat;                                                                                  \
+                                                 int4 output_shape) {                                                 \
+    CHECK_IDX                                                                                                         \
     DOConcat##Inputnum##Axis##ToFormat;                                                                               \
+    WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);                                                 \
   }
 
-// nc4hw4
-CONCAT6(6input, axis1, _NC4HW4)
-CONCAT6(6input, axis2, _NC4HW4)
-CONCAT6(6input, axis3, _NC4HW4)
-CONCAT4(4input, axis1, _NC4HW4)
-CONCAT4(4input, axis2, _NC4HW4)
-CONCAT4(4input, axis3, _NC4HW4)
-CONCAT3(3input, axis1, _NC4HW4)
-CONCAT3(3input, axis2, _NC4HW4)
-CONCAT3(3input, axis3, _NC4HW4)
-CONCAT2(2input, axis1, _NC4HW4)
-CONCAT2(2input, axis2, _NC4HW4)
-CONCAT2(2input, axis3, _NC4HW4)
-
-// nhwc4
+// axis = 1
 CONCAT6(6input, axis1, _NHWC4)
-CONCAT6(6input, axis2, _NHWC4)
-CONCAT6(6input, axis3, _NHWC4)
+CONCAT5(5input, axis1, _NHWC4)
 CONCAT4(4input, axis1, _NHWC4)
-CONCAT4(4input, axis2, _NHWC4)
-CONCAT4(4input, axis3, _NHWC4)
 CONCAT3(3input, axis1, _NHWC4)
-CONCAT3(3input, axis2, _NHWC4)
-CONCAT3(3input, axis3, _NHWC4)
 CONCAT2(2input, axis1, _NHWC4)
+
+// axis = 2
+CONCAT6(6input, axis2, _NHWC4)
+CONCAT5(5input, axis2, _NHWC4)
+CONCAT4(4input, axis2, _NHWC4)
+CONCAT3(3input, axis2, _NHWC4)
 CONCAT2(2input, axis2, _NHWC4)
+
+// axis = 3
+CONCAT6(6input, axis3, _NHWC4)
+CONCAT5(5input, axis3, _NHWC4)
+CONCAT4(4input, axis3, _NHWC4)
+CONCAT3(3input, axis3, _NHWC4)
 CONCAT2(2input, axis3, _NHWC4)
+
+// UnAlign in Axis C for concat
+#define CHECK_IDX_UNALIGN                                                                         \
+  int X = get_global_id(0);                                                                       \
+  int Y = get_global_id(1);                                                                       \
+  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z) {                              \
+    return;                                                                                       \
+  }                                                                                               \
+  int IN = X / output_shape.y, IH = X % output_shape.y;                                           \
+  int IW = Y;                                                                                     \
+  int Align_Shape0 = UP_DIV(input_shape0.w, C4NUM), Align_Shape1 = UP_DIV(input_shape1.w, C4NUM); \
+  int Align_OutShape = output_shape.w;                                                            \
+  int index_output = (IN * output_shape.y + IH) * stride_w + IW * Align_OutShape * C4NUM;
+
+int doconcat(__read_only image2d_t input, __global FLT *output, int Align_Shape, int4 input_shape, int IN, int IH,
+             int Y, int index_output) {
+  int Remainder = input_shape.w % C4NUM;
+  for (int i = 0; i < Align_Shape; ++i) {
+    FLT4 result = READ_IMAGE(input, smp_none, (int2)((Y * Align_Shape + i), (IN * input_shape.y + IH)));
+    FLT result_temp[4] = {result.x, result.y, result.z, result.w};
+    if ((i + 1) * C4NUM <= input_shape.w) {
+      for (int j = 0; j < C4NUM; ++j) {
+        output[index_output++] = result_temp[j];
+      }
+    } else {
+      for (int j = 0; j < Remainder; ++j) {
+        output[index_output++] = result_temp[j];
+      }
+    }
+  }
+  return index_output;
+}
+
+__kernel void ConcatInput2UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1,
+                                        __global FLT *output, int4 input_shape0, int4 input_shape1, int stride_w,
+                                        int4 output_shape) {
+  CHECK_IDX_UNALIGN;
+  index_output = doconcat(input0, output, Align_Shape0, input_shape0, IN, IH, Y, index_output);
+  index_output = doconcat(input1, output, Align_Shape1, input_shape1, IN, IH, Y, index_output);
+}
+
+__kernel void ConcatInput3UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1,
+                                        __read_only image2d_t input2, __global FLT *output, int4 input_shape0,
+                                        int4 input_shape1, int4 input_shape2, int stride_w, int4 output_shape) {
+  CHECK_IDX_UNALIGN;
+  int Align_Shape2 = UP_DIV(input_shape2.w, C4NUM);
+  index_output = doconcat(input0, output, Align_Shape0, input_shape0, IN, IH, Y, index_output);
+  index_output = doconcat(input1, output, Align_Shape1, input_shape1, IN, IH, Y, index_output);
+  index_output = doconcat(input2, output, Align_Shape2, input_shape2, IN, IH, Y, index_output);
+}
+
+__kernel void ConcatInput4UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1,
+                                        __read_only image2d_t input2, __read_only image2d_t input3,
+                                        __global FLT *output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
+                                        int4 input_shape3, int stride_w, int4 output_shape) {
+  CHECK_IDX_UNALIGN;
+  int Align_Shape2 = UP_DIV(input_shape2.w, C4NUM), Align_Shape3 = UP_DIV(input_shape3.w, C4NUM);
+  index_output = doconcat(input0, output, Align_Shape0, input_shape0, IN, IH, Y, index_output);
+  index_output = doconcat(input1, output, Align_Shape1, input_shape1, IN, IH, Y, index_output);
+  index_output = doconcat(input2, output, Align_Shape2, input_shape2, IN, IH, Y, index_output);
+  index_output = doconcat(input3, output, Align_Shape3, input_shape3, IN, IH, Y, index_output);
+}
+
+__kernel void ConcatInput5UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1,
+                                        __read_only image2d_t input2, __read_only image2d_t input3,
+                                        __read_only image2d_t input4, __global FLT *output, int4 input_shape0,
+                                        int4 input_shape1, int4 input_shape2, int4 input_shape3, int4 input_shape4,
+                                        int stride_w, int4 output_shape) {
+  CHECK_IDX_UNALIGN;
+  int Align_Shape2 = UP_DIV(input_shape2.w, C4NUM), Align_Shape3 = UP_DIV(input_shape3.w, C4NUM);
+  int Align_Shape4 = UP_DIV(input_shape4.w, C4NUM);
+  index_output = doconcat(input0, output, Align_Shape0, input_shape0, IN, IH, Y, index_output);
+  index_output = doconcat(input1, output, Align_Shape1, input_shape1, IN, IH, Y, index_output);
+  index_output = doconcat(input2, output, Align_Shape2, input_shape2, IN, IH, Y, index_output);
+  index_output = doconcat(input3, output, Align_Shape3, input_shape3, IN, IH, Y, index_output);
+  index_output = doconcat(input4, output, Align_Shape4, input_shape4, IN, IH, Y, index_output);
+}
+
+__kernel void ConcatInput6UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1,
+                                        __read_only image2d_t input2, __read_only image2d_t input3,
+                                        __read_only image2d_t input4, __read_only image2d_t input5,
+                                        __global FLT *output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
+                                        int4 input_shape3, int4 input_shape4, int4 input_shape5, int stride_w,
+                                        int4 output_shape) {
+  CHECK_IDX_UNALIGN;
+  int Align_Shape2 = UP_DIV(input_shape2.w, C4NUM), Align_Shape3 = UP_DIV(input_shape3.w, C4NUM);
+  int Align_Shape4 = UP_DIV(input_shape4.w, C4NUM), Align_Shape5 = UP_DIV(input_shape5.w, C4NUM);
+  index_output = doconcat(input0, output, Align_Shape0, input_shape0, IN, IH, Y, index_output);
+  index_output = doconcat(input1, output, Align_Shape1, input_shape1, IN, IH, Y, index_output);
+  index_output = doconcat(input2, output, Align_Shape2, input_shape2, IN, IH, Y, index_output);
+  index_output = doconcat(input3, output, Align_Shape3, input_shape3, IN, IH, Y, index_output);
+  index_output = doconcat(input4, output, Align_Shape4, input_shape4, IN, IH, Y, index_output);
+  index_output = doconcat(input5, output, Align_Shape5, input_shape5, IN, IH, Y, index_output);
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
index 9c8d3a12b5..14da40ec69 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
@@ -64,54 +64,107 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *
 }
 
 int ConcatOpenCLKernel::CheckSpecs() {
-  if (in_tensors_[0]->shape().size() != 4) {
-    MS_LOG(ERROR) << " only support dim = 4 ";
-    return RET_ERROR;
-  }
-
   auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
   MS_LOG(DEBUG) << " concat at axis=:  " << param->axis_;
-  if (param->axis_ < 0) {
-    param->axis_ += in_tensors_.front()->shape().size();
+  if (out_tensors_[0]->shape().size() > 4) {
+    MS_LOG(ERROR) << " GPU Unsupported shape.size > 4 "
+                  << "your shape().size()=:  " << out_tensors_[0]->shape().size();
+    return RET_ERROR;
   }
-  if (param->axis_ < 0 || param->axis_ > 3) {
+  axis_ = param->axis_;
+  if (axis_ < 0) {
+    axis_ += in_tensors_.front()->shape().size();
+  }
+  if (axis_ < 0 || axis_ > 3) {
     MS_LOG(ERROR) << " only support axis >= 0 and axis <= 3 ";
     return RET_ERROR;
   }
-
+  if (out_tensors_[0]->shape().size() < 4 && op_parameter_->type_ == PrimitiveType_Concat && axis_ != 0) {
+    if (out_tensors_[0]->shape().size() == 2) {
+      axis_ = axis_ + 2;
+    } else if (out_tensors_[0]->shape().size() == 3) {
+      axis_ = axis_ + 1;
+    } else {
+      MS_LOG(ERROR) << " Unsupported axis =:  " << axis_ << "  shape().size()=:  " << out_tensors_[0]->shape().size();
+      return RET_ERROR;
+    }
+  }
   return RET_OK;
 }
 
 void ConcatOpenCLKernel::SetConstArgs() {
-  auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
-  auto output_shape = out_tensors_[0]->shape();
-  cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
-  int arg_cn = 2 * in_tensors_.size() + 1;
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
+  GpuTensorInfo img_info(out_tensors_[0]);
+  size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
+  stride_w = img_info.RowPitch() / dtype;
+  cl_int4 output_shape_ = {};
+  for (int i = 0; i < out_tensors_[0]->shape().size(); ++i) {
+    output_shape_.s[i] = out_tensors_[0]->shape()[i];
+  }
+  Broadcast2GpuShape(out_shape_.s, output_shape_.s, out_tensors_[0]->shape().size(), 1);
+  int arg_cn = in_tensors_.size() + 1;
+  if (axis_ == 3 && !Align_) {
+    for (int i = 0; i < in_tensors_.size(); ++i) {
+      cl_int4 temp = {};
+      for (int j = 0; j < in_tensors_[i]->shape().size(); ++j) {
+        temp.s[j] = in_tensors_[i]->shape()[j];
+      }
+      Broadcast2GpuShape(in_shape_.s, temp.s, in_tensors_[i]->shape().size(), 1);
+      ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
+    }
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
+  } else {
+    for (int i = 0; i < in_tensors_.size(); ++i) {
+      cl_int4 temp = {};
+      for (int j = 0; j < in_tensors_[i]->shape().size(); ++j) {
+        temp.s[j] = in_tensors_[i]->shape()[j];
+      }
+      Broadcast2GpuShape(in_shape_.s, temp.s, in_tensors_[i]->shape().size(), 1);
+      in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM);
+      ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
+    }
+  }
+  out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
 }
 
 void ConcatOpenCLKernel::SetGlobalLocal() {
-  auto output_shape = out_tensors_[0]->shape();
   const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
-  std::vector<size_t> local = {1, 1, 1};
-  uint32_t OH = output_shape[0] * output_shape[1];
-  uint32_t OW = output_shape[2];
-  uint32_t OC = output_shape[3];
-  std::vector<size_t> global = {OH, OW, OC};
+  if (axis_ == 3 && !Align_) {
+    OH = out_shape_.s[0] * out_shape_.s[1];
+    OW = out_shape_.s[2];
+    global = {OH, OW, 1};
+    local = {1, 1, 1};
+  } else {
+    OH = out_shape_.s[0] * out_shape_.s[1];
+    OW = out_shape_.s[2];
+    OC = out_shape_.s[3];
+    global = {OH, OW, OC};
+    local = {1, 1, 1};
+  }
   ConcatGetWorkGroup(global, &local, max_global[0]);
   OpenCLKernel::AlignGlobalLocal(global, local);
 }
 
 int ConcatOpenCLKernel::Prepare() {
-  auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
+  for (int i = 0; i < in_tensors_.size(); ++i) {
+    int length = in_tensors_[0]->shape().size();
+    if (in_tensors_[i]->shape()[length - 1] % C4NUM != 0) {
+      Align_ = false;
+    }
+  }
+  enable_fp16_ = ocl_runtime_->GetFp16Enable();
   std::string kernel_name = "Concat";
-  if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
-    kernel_name += std::to_string(in_tensors_.size()) + "inputaxis" + std::to_string(param->axis_);
+  if (axis_ == 3 && !Align_) {
+    kernel_name += "Input" + std::to_string(in_tensors_.size()) + "UnAlign";
   } else {
-    MS_LOG(ERROR) << " input must be 2 , 3 , 4 or 6";
-    return RET_ERROR;
+    if (2 <= in_tensors_.size() && in_tensors_.size() <= 6) {
+      kernel_name += std::to_string(in_tensors_.size()) + "inputaxis" + std::to_string(axis_);
+    } else {
+      MS_LOG(ERROR) << " input must be less than 6 and more than 2 ";
+      return RET_ERROR;
+    }
   }
+
   kernel_name += "_NHWC4";
   MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
   std::string source = concat_source;
@@ -119,27 +172,25 @@ int ConcatOpenCLKernel::Prepare() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
-  SetGlobalLocal();
   SetConstArgs();
+  SetGlobalLocal();
   return RET_OK;
 }
 
 int ConcatOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
-  auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
-  if (param->axis_ == 0) {
+  if (axis_ == 0) {
     return RunAxis0();
   }
-  if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
+  if (2 <= in_tensors_.size() && in_tensors_.size() <= 6) {
     int arg_cn = 0;
     for (int i = 0; i < in_tensors_.size(); ++i) {
       ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c());
     }
-    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
-    for (int i = 0; i < in_tensors_.size(); ++i) {
-      cl_int4 temp = {in_tensors_[i]->shape()[0], in_tensors_[i]->shape()[1], in_tensors_[i]->shape()[2],
-                      UP_DIV(in_tensors_[i]->shape()[3], C4NUM)};
-      ocl_runtime_->SetKernelArg(kernel_, arg_cn++, temp);
+    if (axis_ == 3 && !Align_) {
+      ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
+    } else {
+      ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
     }
   } else {
     MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h
index 626be0013a..f390704f8b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h
@@ -39,8 +39,20 @@ class ConcatOpenCLKernel : public OpenCLKernel {
   int Run() override;
 
  private:
-  int RunAxis0();
+  std::vector<size_t> local;
+  uint32_t OH = {1};
+  uint32_t OW = {1};
+  uint32_t OC = {1};
+  std::vector<size_t> global;
+  bool Align_{true};
+  bool enable_fp16_{false};
+  cl_int stride_w{1};
+  cl_int4 in_shape_{};
+  cl_int4 out_shape_{};
+  int axis_{0};
 
+ private:
+  int RunAxis0();
   cl::Kernel kernel_;
 };
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
similarity index 79%
rename from mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
rename to mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
index 3d49806bfd..3ed8bd95e5 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "src/runtime/kernel/opencl/opencl_subgraph.h"
 #include <set>
 #include "src/runtime/opencl/opencl_executor.h"
 #include "src/runtime/kernel/opencl/utils.h"
@@ -26,11 +26,11 @@ using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::lite::opencl::MemType;
 
-SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); }
+OpenCLSubGraph::~OpenCLSubGraph() { UnInit(); }
 
-void SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
-  const std::vector<lite::Tensor *> &in_tensors, const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
-  MemType mem_type) {
+void OpenCLSubGraph::ReplaceOutTensorAndKernelToNull(const std::vector<lite::Tensor *> &in_tensors,
+                                                     const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
+                                                     MemType mem_type) {
   for (size_t i = 0; i < in_tensors.size(); ++i) {
     for (auto &jv : in_kernels.at(i)) {
       MS_ASSERT(jv);
@@ -62,10 +62,10 @@ void SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
   }
 }
 
-void SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
-                                                              const std::vector<kernel::LiteKernel *> &in_kernels,
-                                                              lite::Tensor *new_tensor,
-                                                              kernel::LiteKernel *in_convert_op, MemType mem_type) {
+void OpenCLSubGraph::ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
+                                                        const std::vector<kernel::LiteKernel *> &in_kernels,
+                                                        lite::Tensor *new_tensor, kernel::LiteKernel *in_convert_op,
+                                                        MemType mem_type) {
   MS_ASSERT(in_convert_op);
   auto in_opencl_op = reinterpret_cast<OpenCLKernel *>(in_convert_op);
   for (auto &iv : in_kernels) {
@@ -96,11 +96,11 @@ void SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToConvert(const lite::Tensor
   }
 }
 
-int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
-                                        const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
-                                        std::vector<lite::Tensor *> *out_tensors,
-                                        std::vector<OpenCLToFormatParameter *> *out_parameters,
-                                        std::vector<LiteKernel *> *out_convert_ops, MemType mem_type) {
+int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
+                                  const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
+                                  std::vector<lite::Tensor *> *out_tensors,
+                                  std::vector<OpenCLToFormatParameter *> *out_parameters,
+                                  std::vector<LiteKernel *> *out_convert_ops, MemType mem_type) {
   MS_ASSERT(out_tensors);
   MS_ASSERT(out_parameters);
   MS_ASSERT(out_convert_ops);
@@ -120,7 +120,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
     auto *new_tensor = new (std::nothrow) lite::Tensor();
     MS_ASSERT(new_tensor);
     if (new_tensor == nullptr) {
-      MS_LOG(ERROR) << "SubGraphOpenCLKernel new tensor failed!";
+      MS_LOG(ERROR) << "OpenCLSubGraph new tensor failed!";
       return RET_ERROR;
     }
     new_tensor->CopyTensor(*in_tensors[i]);
@@ -141,7 +141,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
     auto *parameter = static_cast<OpenCLToFormatParameter *>(malloc(sizeof(OpenCLToFormatParameter)));
     MS_ASSERT(parameter);
     if (parameter == nullptr) {
-      MS_LOG(ERROR) << "SubGraphOpenCLKernel new parameter failed!";
+      MS_LOG(ERROR) << "OpenCLSubGraph new parameter failed!";
       delete new_tensor;
       new_tensor = nullptr;
       return RET_ERROR;
@@ -161,7 +161,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
     }
     MS_ASSERT(in_convert_op);
     if (in_convert_op == nullptr) {
-      MS_LOG(ERROR) << "SubGraphOpenCLKernel create op failed!";
+      MS_LOG(ERROR) << "OpenCLSubGraph create op failed!";
       delete new_tensor;
       new_tensor = nullptr;
       free(parameter);
@@ -189,7 +189,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
   return RET_OK;
 }
 
-int SubGraphOpenCLKernel::Init() {
+int OpenCLSubGraph::Init() {
   allocator_ = ocl_runtime_->GetAllocator();
   MS_LOG(DEBUG) << "input num=" << in_tensors_.size() << ", output num=" << out_tensors_.size();
   for (const auto tensor : in_tensors_) {
@@ -233,7 +233,7 @@ int SubGraphOpenCLKernel::Init() {
   return RET_OK;
 }
 
-void SubGraphOpenCLKernel::UpdateTensorDataType() {
+void OpenCLSubGraph::UpdateTensorDataType() {
   bool is_fp16 = ocl_runtime_->GetFp16Enable();
   MS_ASSERT(in_tensors_[0]);
   if (is_fp16 && (in_tensors_[0]->data_type() == kNumberTypeFloat32)) {
@@ -253,7 +253,7 @@ void SubGraphOpenCLKernel::UpdateTensorDataType() {
   }
 }
 
-int SubGraphOpenCLKernel::MallocTensorWithReuse() {
+int OpenCLSubGraph::MallocTensorWithReuse() {
   int ret;
   kernel::LiteKernelUtil::InitTensorRefCount(nodes_);
   for (auto *kernel : nodes_) {
@@ -297,10 +297,9 @@ int SubGraphOpenCLKernel::MallocTensorWithReuse() {
   return RET_OK;
 }
 
-void SubGraphOpenCLKernel::GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
-                                                 const std::vector<kernel::LiteKernel *> &in_kernels,
-                                                 std::vector<std::vector<kernel::LiteKernel *>> *out_kernels,
-                                                 bool is_from) {
+void OpenCLSubGraph::GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
+                                           const std::vector<kernel::LiteKernel *> &in_kernels,
+                                           std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from) {
   std::vector<std::set<lite::Tensor *>> ksets;
   for (auto jv : in_kernels) {
     MS_ASSERT(jv);
@@ -321,32 +320,24 @@ void SubGraphOpenCLKernel::GetKernelFromToTensor(const std::vector<lite::Tensor
   }
 }
 
-void SubGraphOpenCLKernel::GetInOutNodes() {
-  std::vector<std::set<lite::Tensor *>> ksets_in;
-  std::vector<std::set<lite::Tensor *>> ksets_out;
-  for (auto jv : nodes_) {
-    MS_ASSERT(jv);
-    std::set<lite::Tensor *> kset;
-    kset.insert(jv->in_tensors().begin(), jv->in_tensors().end());
-    ksets_in.emplace_back(kset);
-
-    kset.clear();
-    kset.insert(jv->out_tensors().begin(), jv->out_tensors().end());
-    ksets_out.emplace_back(kset);
-  }
-  for (size_t j = 0; j < nodes_.size(); ++j) {
-    if (std::find_if(in_tensors_.begin(), in_tensors_.end(),
-                     [&ksets_in, &j](lite::Tensor *val) { return ksets_in[j].count(val); }) != in_tensors_.end()) {
-      in_nodes_.emplace_back(nodes_.at(j));
+void OpenCLSubGraph::GetInOutNodes() {
+  for (auto *node : nodes_) {
+    for (auto *tensor : node->in_tensors()) {
+      if (std::find(in_tensors_.begin(), in_tensors_.end(), tensor) != in_tensors_.end()) {
+        in_nodes_.emplace_back(node);
+        break;
+      }
     }
-    if (std::find_if(out_tensors_.begin(), out_tensors_.end(),
-                     [&ksets_out, &j](lite::Tensor *val) { return ksets_out[j].count(val); }) != out_tensors_.end()) {
-      out_nodes_.emplace_back(nodes_.at(j));
+    for (auto *tensor : node->out_tensors()) {
+      if (std::find(out_tensors_.begin(), out_tensors_.end(), tensor) != out_tensors_.end()) {
+        out_nodes_.emplace_back(node);
+        break;
+      }
     }
   }
 }
 
-int SubGraphOpenCLKernel::Prepare() {
+int OpenCLSubGraph::Prepare() {
   executor_ = new (std::nothrow) lite::opencl::OpenCLExecutor();
   if (executor_ == nullptr) {
     MS_LOG(ERROR) << "Create OpenCLExecutor fail";
@@ -361,7 +352,7 @@ int SubGraphOpenCLKernel::Prepare() {
   return RET_OK;
 }
 
-void SubGraphOpenCLKernel::UnInit() {
+void OpenCLSubGraph::UnInit() {
   for (const auto &tensor : in_convert_tensors_) {
     delete tensor;
   }
@@ -379,11 +370,11 @@ void SubGraphOpenCLKernel::UnInit() {
   delete this->executor_;
 }
 
-int SubGraphOpenCLKernel::InferShape() { return RET_OK; }
+int OpenCLSubGraph::InferShape() { return RET_OK; }
 
-int SubGraphOpenCLKernel::ReSize() { return RET_OK; }
+int OpenCLSubGraph::ReSize() { return RET_OK; }
 
-int SubGraphOpenCLKernel::Run() {
+int OpenCLSubGraph::Run() {
   if (executor_ == nullptr) {
     MS_LOG(ERROR) << "executor is nullptr";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
similarity index 85%
rename from mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h
rename to mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
index 630273204a..d14e5dbd12 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SUBGRAPH_OPENCL_KENEL_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SUBGRAPH_OPENCL_KENEL_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SUBGRAPH_OPENCL_KERNEL_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SUBGRAPH_OPENCL_KERNEL_H_
 
 #include <set>
 #include <vector>
@@ -25,25 +25,19 @@
 #include "src/sub_graph_kernel.h"
 
 namespace mindspore::kernel {
-struct SubGraphOpenCLParameter {
-  OpParameter op_parameter;
-  int input_size;
-  int output_size;
-};
-
-class SubGraphOpenCLKernel : public SubGraphKernel {
+class OpenCLSubGraph : public SubGraphKernel {
  public:
-  SubGraphOpenCLKernel(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
-                       const std::vector<kernel::LiteKernel *> &inKernels,
-                       const std::vector<kernel::LiteKernel *> &outKernels,
-                       const std::vector<kernel::LiteKernel *> &nodes, const lite::InnerContext *ctx = nullptr)
+  OpenCLSubGraph(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
+                 const std::vector<kernel::LiteKernel *> &inKernels,
+                 const std::vector<kernel::LiteKernel *> &outKernels, const std::vector<kernel::LiteKernel *> &nodes,
+                 const lite::InnerContext *ctx = nullptr)
       : SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, ctx) {
     ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
     subgraph_type_ = kGpuSubGraph;
     this->name_ = "GpuSubGraph";
     nodes_set_.insert(nodes.begin(), nodes.end());
   }
-  ~SubGraphOpenCLKernel() override;
+  ~OpenCLSubGraph() override;
 
   int PreProcess() override { return mindspore::lite::RET_OK; }
   int PostProcess() override { return mindspore::lite::RET_OK; }
diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc
index d73d3c4e2c..5e72a4c345 100644
--- a/mindspore/lite/src/scheduler.cc
+++ b/mindspore/lite/src/scheduler.cc
@@ -25,7 +25,7 @@
 #include "src/kernel_registry.h"
 #include "src/sub_graph_kernel.h"
 #if SUPPORT_GPU
-#include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "src/runtime/kernel/opencl/opencl_subgraph.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #endif
 
@@ -241,7 +241,7 @@ kernel::SubGraphKernel *Scheduler::CreateSubGraphKernel(const std::vector<kernel
   if (type == kernel::kGpuSubGraph) {
 #if SUPPORT_GPU
     auto sub_kernel = new (std::nothrow)
-      kernel::SubGraphOpenCLKernel(input_tensors, output_tensors, input_kernels, output_kernels, kernels, context_);
+      kernel::OpenCLSubGraph(input_tensors, output_tensors, input_kernels, output_kernels, kernels, context_);
     return sub_kernel;
 #else
     return nullptr;
diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt
index 23ee1bb461..90c3aac375 100644
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@@ -83,7 +83,7 @@ if (SUPPORT_GPU)
     set(KERNEL_OP_SRC
             ${KERNEL_OP_SRC}
             ${GPU_KERNEL_OP_SRC}
-            ${LITE_DIR}/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
+            ${LITE_DIR}/src/runtime/kernel/opencl/opencl_subgraph.cc
             ${LITE_DIR}/src/runtime/kernel/opencl/utils.cc
             )
 endif()
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc
index 45935df3a9..037aac107e 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/biasadd_tests.cc
@@ -19,12 +19,12 @@
 #include "common/common_test.h"
 #include "mindspore/lite/src/common/file_utils.h"
 #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h"
 
 using mindspore::kernel::BiasAddOpenCLKernel;
 using mindspore::kernel::LiteKernel;
-using mindspore::kernel::SubGraphOpenCLKernel;
+using mindspore::kernel::OpenCLSubGraph;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 
@@ -157,7 +157,7 @@ TEST_F(TestBiasAddOpenCL, BiasAddFp32_dim4) {
 
   MS_LOG(INFO) << "initialize sub_graph";
   std::vector<kernel::LiteKernel *> kernels{biasadd_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({input_tensor}, outputs, kernels, kernels, kernels);
+  auto *sub_graph = new (std::nothrow) kernel::OpenCLSubGraph({input_tensor}, outputs, kernels, kernels, kernels);
   if (sub_graph == nullptr) {
     MS_LOG(ERROR) << "Create sub_graph kernel error.";
     delete input_tensor;
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc
index 556d99e17f..f34eaf8055 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc
@@ -19,7 +19,7 @@
 #include "common/common_test.h"
 #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
 #include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h"
 
 // PrimitiveType_Cast: src/ops/populate/cast_populate.cc
@@ -93,9 +93,9 @@ TEST_F(TestCastSelfOpenCL, Castfp32tofp16) {
   }
   MS_LOG(INFO) << " initialize sub_graph ";
   std::vector<kernel::LiteKernel *> kernels{cast_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
+  auto *sub_graph = new (std::nothrow) kernel::OpenCLSubGraph(inputs, outputs, kernels, kernels, kernels);
   if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
+    MS_LOG(INFO) << " new kernel::OpenCLSubGraph failed ";
     for (auto tensor : inputs) {
       delete tensor;
     }
@@ -179,9 +179,9 @@ TEST_F(TestCastSelfOpenCL, Castfp16tofp32) {
   }
   MS_LOG(INFO) << " initialize sub_graph ";
   std::vector<kernel::LiteKernel *> kernels{cast_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
+  auto *sub_graph = new (std::nothrow) kernel::OpenCLSubGraph(inputs, outputs, kernels, kernels, kernels);
   if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
+    MS_LOG(INFO) << " new kernel::OpenCLSubGraph failed ";
     for (auto tensor : inputs) {
       delete tensor;
     }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc
index a2f14be286..279d7699ba 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/common.cc
@@ -17,11 +17,11 @@
 #include <algorithm>
 #include "ut/src/runtime/kernel/opencl/common.h"
 #include "src/kernel_registry.h"
-#include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "src/runtime/kernel/opencl/opencl_subgraph.h"
 #include "nnacl/conv_parameter.h"
 
 using mindspore::kernel::LiteKernel;
-using mindspore::kernel::SubGraphOpenCLKernel;
+using mindspore::kernel::OpenCLSubGraph;
 using mindspore::lite::KernelRegistry;
 using mindspore::schema::Format::Format_NHWC;
 
@@ -99,12 +99,12 @@ void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, std::tuple<std
   // simulating benchmark:  session_->CompileGraph() -> scheduler.Schedule() -> ConstructSubGraphs()
   MS_LOG(DEBUG) << "create SubGraph";
   std::vector<LiteKernel *> kernels{kernel};
-  auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel(subgraph_inputs, {&output}, kernels, kernels, kernels);
+  auto sub_graph = new (std::nothrow) OpenCLSubGraph(subgraph_inputs, {&output}, kernels, kernels, kernels);
   if (sub_graph == nullptr) {
     return;
   }
 
-  // simulating benchmark:  session_->CompileGraph() -> PrepareKernels() -> SubGraphOpenCLKernel.Prepare()
+  // simulating benchmark:  session_->CompileGraph() -> PrepareKernels() -> OpenCLSubGraph.Prepare()
   MS_LOG(DEBUG) << "call sub_graph->Prepare()";
   EXPECT_TRUE(sub_graph->Prepare() == RET_OK);  // will set Tensor's allocator be OpenCLAllocator
 
@@ -128,7 +128,7 @@ void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, std::tuple<std
     memcpy(input->data_c(), subgraph_inputs_data[input], input->Size());
   }
 
-  // simulating benchmark:  MarkAccuracy() -> session_->RunGraph() -> executor_->Run() -> SubGraphOpenCLKernel->Run()
+  // simulating benchmark:  MarkAccuracy() -> session_->RunGraph() -> executor_->Run() -> OpenCLSubGraph->Run()
   MS_LOG(DEBUG) << "run SubGraph & compare result";
   EXPECT_TRUE(sub_graph->Run() == RET_OK);  // will call UnmapBuffer() for input
 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
index d2ec96c396..4efdbc98b0 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
@@ -44,4 +44,288 @@ TEST_F(TestOpenCL_Concat, input2_axis0) {
   }
 }
 
+TEST_F(TestOpenCL_Concat, input2_axis1_Align) {
+  std::vector<int> input0_shape = {2, 2, 2, 8};
+  std::vector<int> input1_shape = {2, 2, 2, 8};
+  std::vector<int> output_shape = {2, 4, 2, 8};
+  int axis = 1;
+  float input0_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39};
+  float input1_data[] = {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41};
+  float output_data[] = {
+    0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74,
+    0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69,
+    0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,
+    0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30,
+    0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25,
+    0.39, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,
+    0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
+             fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Concat, input6_axis1_Align) {
+  std::vector<int> input0_shape = {2, 3, 2, 8};
+  std::vector<int> input1_shape = {2, 3, 2, 8};
+  std::vector<int> input2_shape = {2, 3, 2, 8};
+  std::vector<int> input3_shape = {2, 3, 2, 8};
+  std::vector<int> input4_shape = {2, 3, 2, 8};
+  std::vector<int> input5_shape = {2, 3, 2, 8};
+  std::vector<int> output_shape = {2, 18, 2, 8};
+  int axis = 1;
+  float input0_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39};
+
+  float input1_data[] = {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41};
+
+  float input2_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39};
+
+  float input3_data[] = {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41};
+
+  float input4_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39};
+
+  float input5_data[] = {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+                         0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41};
+  float output_data[] = {
+    0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74,
+    0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59,
+    0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,
+    0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23,
+    0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13,
+    0.41, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06,
+    0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,
+    0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+    0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74,
+    0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69,
+    0.13, 0.41, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75,
+    0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30,
+    0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13,
+    0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,
+    0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46,
+    0.69, 0.13, 0.41, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39,
+    0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74,
+    0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69,
+    0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,
+    0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23,
+    0.46, 0.69, 0.13, 0.41, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25,
+    0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06,
+    0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.5,  0.6,  0.74, 0.23, 0.46,
+    0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41,
+    0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74,
+    0.23, 0.46, 0.69, 0.13, 0.41, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59,
+    0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75,
+    0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.25, 0.39, 0.5,  0.6,  0.74, 0.23,
+    0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13,
+    0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.5,  0.6,
+    0.74, 0.23, 0.46, 0.69, 0.13, 0.41};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input0_shape, input0_data, VAR},
+              {input1_shape, input1_data, VAR},
+              {input2_shape, input2_data, VAR},
+              {input3_shape, input3_data, VAR},
+              {input4_shape, input4_data, VAR},
+              {input5_shape, input5_data, VAR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Concat, input6_axis2_Align) {
+  std::vector<int> input0_shape = {1, 1, 8};
+  std::vector<int> input1_shape = {1, 1, 8};
+  std::vector<int> input2_shape = {1, 1, 8};
+  std::vector<int> input3_shape = {1, 1, 8};
+  std::vector<int> input4_shape = {1, 1, 8};
+  std::vector<int> input5_shape = {1, 1, 8};
+  std::vector<int> output_shape = {1, 1, 48};
+  int axis = 2;
+  float input0_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.13, 0.16};
+  float input1_data[] = {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.47, 0.16};
+  float input2_data[] = {0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.16};
+  float input3_data[] = {0.52, 0.63, 0.78, 0.43, 0.56, 0.69, 0.87, 0.16};
+  float input4_data[] = {0.5, 0.6, 0.74, 0.30, 0.9, 0.59, 0.13, 0.16};
+  float input5_data[] = {0.75, 0.06, 0.74, 0.23, 0.46, 0.69, 0.47, 0.16};
+  float output_data[] = {0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.13, 0.16, 0.5,  0.6,  0.74, 0.23,
+                         0.46, 0.69, 0.47, 0.16, 0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.16,
+                         0.52, 0.63, 0.78, 0.43, 0.56, 0.69, 0.87, 0.16, 0.5,  0.6,  0.74, 0.30,
+                         0.9,  0.59, 0.13, 0.16, 0.75, 0.06, 0.74, 0.23, 0.46, 0.69, 0.47, 0.16};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input0_shape, input0_data, VAR},
+              {input1_shape, input1_data, VAR},
+              {input2_shape, input2_data, VAR},
+              {input3_shape, input3_data, VAR},
+              {input4_shape, input4_data, VAR},
+              {input5_shape, input5_data, VAR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Concat, input2_axis3_UnAlign) {
+  std::vector<int> input0_shape = {2, 2, 2, 8};
+  std::vector<int> input1_shape = {2, 2, 2, 9};
+  std::vector<int> output_shape = {2, 2, 2, 17};
+  int axis = 3;
+  float input0_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39,
+                         0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39};
+  float input1_data[] = {0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69,
+                         0.13, 0.41, 0.52, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52, 0.5,  0.6,  0.74,
+                         0.23, 0.46, 0.69, 0.13, 0.41, 0.52, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52,
+                         0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69,
+                         0.13, 0.41, 0.52, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52};
+  float output_data[] = {
+    0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52,
+    0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52,
+    0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52,
+    0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52,
+    0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52,
+    0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52,
+    0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52,
+    0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.25, 0.39, 0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.41, 0.52,
+  };
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
+             fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Concat, input3_axis1_UnAlign) {
+  std::vector<int> input0_shape = {1, 6};
+  std::vector<int> input1_shape = {1, 7};
+  std::vector<int> input2_shape = {1, 8};
+  std::vector<int> output_shape = {1, 21};
+  int axis = 1;
+  float input0_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59};
+  float input1_data[] = {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.47};
+  float input2_data[] = {0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.13};
+  float output_data[] = {0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.5,  0.6,  0.74, 0.23, 0.46,
+                         0.69, 0.47, 0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.13};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}, {input2_shape, input2_data, VAR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Concat, input4_axis3_UnAlign) {
+  std::vector<int> input0_shape = {1, 1, 1, 6};
+  std::vector<int> input1_shape = {1, 1, 1, 7};
+  std::vector<int> input2_shape = {1, 1, 1, 8};
+  std::vector<int> input3_shape = {1, 1, 1, 9};
+  std::vector<int> output_shape = {1, 1, 1, 30};
+  int axis = -1;
+  float input0_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59};
+  float input1_data[] = {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.47};
+  float input2_data[] = {0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.13};
+  float input3_data[] = {0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.13, 0.26};
+  float output_data[] = {0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.47, 0.03, 0.37,
+                         0.74, 0.23, 0.46, 0.69, 0.13, 0.13, 0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.13, 0.26};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input0_shape, input0_data, VAR},
+              {input1_shape, input1_data, VAR},
+              {input2_shape, input2_data, VAR},
+              {input3_shape, input3_data, VAR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Concat, input5_axis3_UnAlign) {
+  std::vector<int> input0_shape = {1, 1, 1, 6};
+  std::vector<int> input1_shape = {1, 1, 1, 7};
+  std::vector<int> input2_shape = {1, 1, 1, 8};
+  std::vector<int> input3_shape = {1, 1, 1, 9};
+  std::vector<int> input4_shape = {1, 1, 1, 10};
+  std::vector<int> output_shape = {1, 1, 1, 40};
+  int axis = 3;
+  float input0_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59};
+  float input1_data[] = {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.47};
+  float input2_data[] = {0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.13};
+  float input3_data[] = {0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.13, 0.26};
+  float input4_data[] = {0.06, 0.47, 0.74, 0.23, 0.56, 0.69, 0.73, 0.13, 0.96, 0.78};
+  float output_data[] = {0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.47, 0.03,
+                         0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.13, 0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13,
+                         0.13, 0.26, 0.06, 0.47, 0.74, 0.23, 0.56, 0.69, 0.73, 0.13, 0.96, 0.78};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input0_shape, input0_data, VAR},
+              {input1_shape, input1_data, VAR},
+              {input2_shape, input2_data, VAR},
+              {input3_shape, input3_data, VAR},
+              {input4_shape, input4_data, VAR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
+TEST_F(TestOpenCL_Concat, input6_axis3_UnAlign) {
+  std::vector<int> input0_shape = {1, 1, 1, 6};
+  std::vector<int> input1_shape = {1, 1, 1, 7};
+  std::vector<int> input2_shape = {1, 1, 1, 8};
+  std::vector<int> input3_shape = {1, 1, 1, 9};
+  std::vector<int> input4_shape = {1, 1, 1, 10};
+  std::vector<int> input5_shape = {1, 1, 1, 11};
+  std::vector<int> output_shape = {1, 1, 1, 51};
+  int axis = 3;
+  float input0_data[] = {0.75, 0.06, 0.74, 0.30, 0.9, 0.59};
+  float input1_data[] = {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.47};
+  float input2_data[] = {0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.13};
+  float input3_data[] = {0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.13, 0.26};
+  float input4_data[] = {0.06, 0.47, 0.74, 0.23, 0.56, 0.69, 0.73, 0.13, 0.96, 0.78};
+  float input5_data[] = {0.16, 0.77, 0.84, 0.53, 0.36, 0.29, 0.53, 0.23, 0.86, 0.48, 0.36};
+  float output_data[] = {0.75, 0.06, 0.74, 0.30, 0.9,  0.59, 0.5,  0.6,  0.74, 0.23, 0.46, 0.69, 0.47,
+                         0.03, 0.37, 0.74, 0.23, 0.46, 0.69, 0.13, 0.13, 0.03, 0.37, 0.74, 0.23, 0.46,
+                         0.69, 0.13, 0.13, 0.26, 0.06, 0.47, 0.74, 0.23, 0.56, 0.69, 0.73, 0.13, 0.96,
+                         0.78, 0.16, 0.77, 0.84, 0.53, 0.36, 0.29, 0.53, 0.23, 0.86, 0.48, 0.36};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(axis);
+    TestMain({{input0_shape, input0_data, VAR},
+              {input1_shape, input1_data, VAR},
+              {input2_shape, input2_data, VAR},
+              {input3_shape, input3_data, VAR},
+              {input4_shape, input4_data, VAR},
+              {input5_shape, input5_data, VAR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-3 : 1e-9);
+  }
+}
+
 }  // namespace mindspore::lite::opencl::test
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_tests.cc
index cb9fdec16d..56b5371292 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_tests.cc
@@ -21,7 +21,7 @@ namespace mindspore::lite::opencl::test {
 class TestOpenCL_Conv2D : public CommonTest {};
 
 namespace {
-// PrimitiveType_Concat: src/ops/populate/conv2d_populate.cc
+// PrimitiveType_Conv2D: src/ops/populate/conv2d_populate.cc
 ConvParameter *CreateParameter(const std::string &attr, ActType act_type) {
   auto *param = test::CreateParameter<ConvParameter>(schema::PrimitiveType_Conv2D);
   param->act_type_ = act_type;
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/fill_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/fill_tests.cc
index e9c7cf4c99..917073ec64 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/fill_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/fill_tests.cc
@@ -18,7 +18,7 @@
 #include "src/common/log_adapter.h"
 #include "common/common_test.h"
 #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/fill.h"
 using mindspore::lite::Tensor;
 using mindspore::schema::PrimitiveType_Fill;
@@ -70,9 +70,9 @@ TEST_F(TestFillOpenCLCI, Fp32testfill) {
   fill_kernel->Init();
   MS_LOG(INFO) << " initialize sub_graph ";
   std::vector<kernel::LiteKernel *> kernels{fill_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
+  auto *sub_graph = new (std::nothrow) kernel::OpenCLSubGraph({&in_tensor1}, outputs, kernels, kernels, kernels);
   if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
+    MS_LOG(INFO) << " new kernel::OpenCLSubGraph failed ";
     delete param;
     delete fill_kernel;
     return;
@@ -126,9 +126,9 @@ TEST_F(TestFillOpenCLCI, Fp32testshape) {
   fill_kernel->Init();
   MS_LOG(INFO) << " initialize sub_graph ";
   std::vector<kernel::LiteKernel *> kernels{fill_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
+  auto *sub_graph = new (std::nothrow) kernel::OpenCLSubGraph({&in_tensor1}, outputs, kernels, kernels, kernels);
   if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
+    MS_LOG(INFO) << " new kernel::OpenCLSubGraph failed ";
     delete param;
     delete fill_kernel;
     return;
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
index 55652d1fea..e95dc17fdd 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/power_tests.cc
@@ -18,7 +18,7 @@
 #include "src/common/log_adapter.h"
 #include "common/common_test.h"
 #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/power.h"
 
 // PrimitiveType_Power: src/ops/populate/power_populate.cc
@@ -88,9 +88,9 @@ void TEST_MAIN(const T *input_data1, const T *input_data2, const T *expect_data,
 
   MS_LOG(INFO) << " initialize sub_graph ";
   std::vector<kernel::LiteKernel *> kernels{power_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
+  auto *sub_graph = new (std::nothrow) kernel::OpenCLSubGraph(inputs, outputs, kernels, kernels, kernels);
   if (sub_graph == nullptr) {
-    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
+    MS_LOG(INFO) << " new kernel::OpenCLSubGraph failed ";
     delete param;
     delete power_kernel;
     return;
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc
index 96707e4fa5..54bd940b49 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/prelu_tests.cc
@@ -20,13 +20,13 @@
 #include "common/common_test.h"
 #include "mindspore/lite/src/common/file_utils.h"
 #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h"
 #include "mindspore/lite/nnacl/prelu_parameter.h"
 
 using mindspore::kernel::LiteKernel;
+using mindspore::kernel::OpenCLSubGraph;
 using mindspore::kernel::PReluOpenCLKernel;
-using mindspore::kernel::SubGraphOpenCLKernel;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 
@@ -150,7 +150,7 @@ TEST_F(TestPReluOpenCL, PReluFp32_dim4) {
 
   MS_LOG(INFO) << "initialize sub_graph";
   std::vector<kernel::LiteKernel *> kernels{prelu_kernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({input_tensor}, outputs, kernels, kernels, kernels);
+  auto *sub_graph = new (std::nothrow) kernel::OpenCLSubGraph({input_tensor}, outputs, kernels, kernels, kernels);
   if (sub_graph == nullptr) {
     MS_LOG(ERROR) << "Create kernel sub_graph error";
     delete input_tensor;
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc
index b61c701afb..a5aaf26ca8 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc
@@ -19,7 +19,7 @@
 #include "common/common_test.h"
 #include "mindspore/lite/src/common/file_utils.h"
 #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h"
 
 namespace mindspore::lite::opencl::test {
@@ -69,7 +69,7 @@ TEST_F(TestToFormatOpenCL, ToFormatNHWC2NCHW) {
   inputs[0]->MallocData(allocator);
 
   std::vector<kernel::LiteKernel *> kernels{arith_kernel};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
+  auto pGraph_ptr = std::make_unique<kernel::OpenCLSubGraph>(inputs, outputs, kernels, kernels, kernels);
   auto pGraph = pGraph_ptr.get();
   if (pGraph == nullptr) {
     MS_LOG(ERROR) << "pGraph create error.";