GPU supports p2p nccl interfaces

5 years ago · db3a2d60cb
parent dd86f0234d
commit db3a2d60cb
23 changed files with 637 additions and 221 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
+++ b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
@ -57,7 +57,9 @@ if (ENABLE_GPU)
 	)

    file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc")
-    list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_gpu_kernel.cc")
+    list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_collective_gpu_kernel.cc")
+    list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_send_gpu_kernel.cc")
+    list(REMOVE_ITEM GPU_SRC_LIST "gpu/nccl/nccl_recv_gpu_kernel.cc")

    if (ENABLE_MPI)
        include(ExternalProject)
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.cc
@ -1,5 +1,5 @@
 /**
- * Copyright 2019 Huawei Technologies Co., Ltd
+ * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -14,48 +14,48 @@
 * limitations under the License.
 */

-#include "backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_ONE(
  AllReduce, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
-  NcclGpuKernel, float)
+  NcclCollectiveGpuKernel, float)
 MS_REG_GPU_KERNEL_ONE(
  AllReduce, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
-  NcclGpuKernel, half)
+  NcclCollectiveGpuKernel, half)
 MS_REG_GPU_KERNEL_ONE(AllReduce,
                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
-                      NcclGpuKernel, int)
+                      NcclCollectiveGpuKernel, int)

 MS_REG_GPU_KERNEL_ONE(
  AllGather, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
-  NcclGpuKernel, float)
+  NcclCollectiveGpuKernel, float)
 MS_REG_GPU_KERNEL_ONE(
  AllGather, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
-  NcclGpuKernel, half)
+  NcclCollectiveGpuKernel, half)
 MS_REG_GPU_KERNEL_ONE(AllGather,
                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
-                      NcclGpuKernel, int)
+                      NcclCollectiveGpuKernel, int)

 MS_REG_GPU_KERNEL_ONE(
  ReduceScatter, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
-  NcclGpuKernel, float)
+  NcclCollectiveGpuKernel, float)
 MS_REG_GPU_KERNEL_ONE(
  ReduceScatter, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
-  NcclGpuKernel, half)
+  NcclCollectiveGpuKernel, half)
 MS_REG_GPU_KERNEL_ONE(ReduceScatter,
                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
-                      NcclGpuKernel, int)
+                      NcclCollectiveGpuKernel, int)

 MS_REG_GPU_KERNEL_ONE(
  Broadcast, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
-  NcclGpuKernel, float)
+  NcclCollectiveGpuKernel, float)
 MS_REG_GPU_KERNEL_ONE(
  Broadcast, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
-  NcclGpuKernel, half)
+  NcclCollectiveGpuKernel, half)
 MS_REG_GPU_KERNEL_ONE(Broadcast,
                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
-                      NcclGpuKernel, int)
+                      NcclCollectiveGpuKernel, int)
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.h
@ -0,0 +1,211 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_COLLECTIVE_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_COLLECTIVE_GPU_KERNEL_H_
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <vector>
+#include <string>
+#include <map>
+#include "backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+enum NcclKernelType {  // Collective operation a kernel instance performs; assigned from kNcclTypeMap.
+  NCCL_ALL_REDUCE = 0,
+  NCCL_ALL_GATHER,
+  NCCL_REDUCE_SCATTER,
+  NCCL_BROADCAST,
+  NCCL_INVALID_TYPE = 255  // Initial value of a kernel's type; Launch() reports it as unsupported.
+};
+const std::map<std::string, NcclKernelType> kNcclTypeMap = {  // CNode name -> collective type, see InferCommType().
+  {"AllReduce", NCCL_ALL_REDUCE},
+  {"AllGather", NCCL_ALL_GATHER},
+  {"ReduceScatter", NCCL_REDUCE_SCATTER},
+  {"Broadcast", NCCL_BROADCAST},
+};
+
+// Kernel for the NCCL collective ops AllReduce, AllGather, ReduceScatter and Broadcast.
+// Init() reads the op type, reduce op, root rank, group and optional stream from the CNode; Launch() looks up the
+// matching entry point in the collective library handle with dlsym and calls it. T is the tensor element type.
+template <typename T>
+class NcclCollectiveGpuKernel : public NcclGpuKernel {
+ public:
+  NcclCollectiveGpuKernel() = default;
+  ~NcclCollectiveGpuKernel() override = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  // Runs the collective chosen by Init(). The stream taken from the node's "stream_id" attribute is used when one
+  // was set, otherwise the caller's stream_ptr. Raises an exception when the entry point cannot be resolved, when
+  // the collective call reports an error, or when the kernel type is not one of the supported collectives.
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
+    T *input_addr = GetDeviceAddress<T>(inputs, 0);
+    T *output_addr = GetDeviceAddress<T>(outputs, 0);
+
+    cudaStream_t stream = comm_stream_;
+    if (stream == nullptr) {
+      stream = reinterpret_cast<cudaStream_t>(stream_ptr);
+    }
+    // dlsym takes a mutable handle, the member is stored as const void *.
+    void *library = const_cast<void *>(collective_handle_);
+    switch (nccl_kernel_type_) {
+      case NCCL_ALL_REDUCE: {
+        auto all_reduce_funcptr = reinterpret_cast<AllReduce>(dlsym(library, "AllReduce"));
+        MS_EXCEPTION_IF_NULL(all_reduce_funcptr);
+        const size_t elem_count = output_size_ / sizeof(T);
+        CHECK_NCCL_RET_WITH_EXCEPT((*all_reduce_funcptr)(input_addr, output_addr, elem_count, nccl_data_type_,
+                                                         nccl_reduce_type_, stream, group_name_),
+                                   "ncclAllReduce failed");
+        break;
+      }
+      case NCCL_ALL_GATHER: {
+        auto all_gather_funcptr = reinterpret_cast<AllGather>(dlsym(library, "AllGather"));
+        MS_EXCEPTION_IF_NULL(all_gather_funcptr);
+        // AllGather is the only collective here whose count comes from the input size.
+        const size_t elem_count = input_size_ / sizeof(T);
+        CHECK_NCCL_RET_WITH_EXCEPT(
+          (*all_gather_funcptr)(input_addr, output_addr, elem_count, nccl_data_type_, stream, group_name_),
+          "ncclAllGather failed");
+        break;
+      }
+      case NCCL_REDUCE_SCATTER: {
+        auto reduce_scatter_funcptr = reinterpret_cast<ReduceScatter>(dlsym(library, "ReduceScatter"));
+        MS_EXCEPTION_IF_NULL(reduce_scatter_funcptr);
+        const size_t elem_count = output_size_ / sizeof(T);
+        CHECK_NCCL_RET_WITH_EXCEPT((*reduce_scatter_funcptr)(input_addr, output_addr, elem_count, nccl_data_type_,
+                                                             nccl_reduce_type_, stream, group_name_),
+                                   "ncclReduceScatter failed");
+        break;
+      }
+      case NCCL_BROADCAST: {
+        auto broadcast_funcptr = reinterpret_cast<Broadcast>(dlsym(library, "Broadcast"));
+        MS_EXCEPTION_IF_NULL(broadcast_funcptr);
+        // One broadcast call per input tensor, pairing input idx with output idx.
+        const int tensor_num = SizeToInt(input_size_list_.size());
+        for (int idx = 0; idx < tensor_num; ++idx) {
+          input_addr = GetDeviceAddress<T>(inputs, idx);
+          output_addr = GetDeviceAddress<T>(outputs, idx);
+          const size_t elem_count = output_size_list_[idx] / sizeof(T);
+          CHECK_NCCL_RET_WITH_EXCEPT(
+            (*broadcast_funcptr)(input_addr, output_addr, elem_count, nccl_data_type_, root_, stream, group_name_),
+            "ncclBroadcast failed");
+        }
+        break;
+      }
+      default: {
+        MS_LOG(EXCEPTION) << "Kernel type " << nccl_kernel_type_ << " is not supported.";
+      }
+    }
+    return true;
+  }
+
+  // Collects everything Launch() needs from kernel_node: NCCL data type of input 0, collective type, reduce op and
+  // root rank, per-tensor byte sizes, group name, optional communication stream and the collective library handle.
+  bool Init(const CNodePtr &kernel_node) override {
+    nccl_data_type_ = nccl_dtype(AnfAlgo::GetInputDeviceDataType(kernel_node, 0));
+    InferCommType(kernel_node);
+
+    const size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+    const size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+    for (size_t in_idx = 0; in_idx < input_num; ++in_idx) {
+      const size_t bytes = TensorBytes(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, in_idx));
+      input_size_list_.push_back(bytes);
+      input_size_ += bytes;
+    }
+    for (size_t out_idx = 0; out_idx < output_num; ++out_idx) {
+      const size_t bytes = TensorBytes(AnfAlgo::GetOutputInferShape(kernel_node, out_idx));
+      output_size_list_.push_back(bytes);
+      output_size_ += bytes;
+    }
+
+    group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
+    MS_LOG(INFO) << AnfAlgo::GetCNodeName(kernel_node) << " for group " << group_name_;
+
+    // The "stream_id" attribute, when present, carries a stream pointer stored as an integer.
+    auto stream_attr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("stream_id");
+    if (stream_attr != nullptr) {
+      comm_stream_ = reinterpret_cast<cudaStream_t>(GetValue<uintptr_t>(stream_attr));
+      MS_EXCEPTION_IF_NULL(comm_stream_);
+    }
+
+    collective_handle_ = device::gpu::CollectiveInitializer::instance().collective_handle();
+    MS_EXCEPTION_IF_NULL(collective_handle_);
+    return true;
+  }
+
+ protected:
+  // Size lists are filled directly in Init(), so there is nothing to do here.
+  void InitSizeLists() override {}
+
+ private:
+  // Byte size of one tensor with the given shape: sizeof(T) times every dimension. Only AllReduce sizes are
+  // rounded up to COMMUNICATION_MEM_ALIGN_SIZE; the other collectives keep the exact size.
+  template <typename Shape>
+  size_t TensorBytes(const Shape &shape) const {
+    size_t bytes = sizeof(T);
+    for (const auto &dim : shape) {
+      bytes *= IntToSize(dim);
+    }
+    return (nccl_kernel_type_ == NCCL_ALL_REDUCE) ? AlignMemorySize(bytes) : bytes;
+  }
+
+  // Derives the collective type from the CNode name and picks up the optional reduce-op and root-rank attributes.
+  // Raises an exception for an unknown kernel name or an unknown reduce op.
+  void InferCommType(const CNodePtr &kernel_node) {
+    const std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
+    const auto type_iter = kNcclTypeMap.find(kernel_name);
+    if (type_iter != kNcclTypeMap.end()) {
+      nccl_kernel_type_ = type_iter->second;
+    } else {
+      MS_LOG(EXCEPTION) << "Kernel " << kernel_name << " is not supported.";
+    }
+
+    auto reduce_op = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kAttrOp);
+    if (reduce_op) {
+      static const std::map<std::string, ncclRedOp_t> kReduceOps = {
+        {"sum", ncclSum}, {"max", ncclMax}, {"min", ncclMin}, {"prod", ncclProd}};
+      const std::string type = GetValue<std::string>(reduce_op);
+      const auto op_iter = kReduceOps.find(type);
+      if (op_iter != kReduceOps.end()) {
+        nccl_reduce_type_ = op_iter->second;
+      } else {
+        MS_LOG(EXCEPTION) << "Nccl reduce type " << type << " is not supported.";
+      }
+    }
+
+    auto root_rank = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kAttrRootRank);
+    if (root_rank) {
+      root_ = static_cast<int>(GetValue<int64_t>(root_rank));
+    }
+  }
+
+  // Rounds size up to a multiple of COMMUNICATION_MEM_ALIGN_SIZE; a size of 0 yields one full alignment unit.
+  size_t AlignMemorySize(size_t size) const {
+    const size_t units = (size == 0) ? 1 : (size + COMMUNICATION_MEM_ALIGN_SIZE - 1) / COMMUNICATION_MEM_ALIGN_SIZE;
+    return units * COMMUNICATION_MEM_ALIGN_SIZE;
+  }
+
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+  NcclKernelType nccl_kernel_type_ = NCCL_INVALID_TYPE;
+  ncclRedOp_t nccl_reduce_type_ = ncclSum;
+  size_t input_size_ = 0;   // Sum of all entries of input_size_list_.
+  size_t output_size_ = 0;  // Sum of all entries of output_size_list_.
+  int root_ = 0;            // Root rank handed to Broadcast.
+  const void *collective_handle_ = nullptr;
+  cudaStream_t comm_stream_ = nullptr;
+
+  static const size_t COMMUNICATION_MEM_ALIGN_SIZE = 16;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_COLLECTIVE_GPU_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h
@ -1,5 +1,5 @@
 /**
- * Copyright 2019 Huawei Technologies Co., Ltd
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -18,11 +18,9 @@
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_GPU_KERNEL_H_

 #include <nccl.h>
-#include <dlfcn.h>
-#include <stdint.h>
-#include <vector>
-#include <string>
 #include <map>
+#include <string>
+#include <vector>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "backend/kernel_compiler/gpu/kernel_constants.h"
@ -30,20 +28,6 @@

 namespace mindspore {
 namespace kernel {
-enum NcclKernelType {
-  NCCL_ALL_REDUCE = 0,
-  NCCL_ALL_GATHER,
-  NCCL_REDUCE_SCATTER,
-  NCCL_BROADCAST,
-  NCCL_INVALID_TYPE = 255
-};
-const std::map<std::string, NcclKernelType> kNcclTypeMap = {
-  {"AllReduce", NCCL_ALL_REDUCE},
-  {"AllGather", NCCL_ALL_GATHER},
-  {"ReduceScatter", NCCL_REDUCE_SCATTER},
-  {"Broadcast", NCCL_BROADCAST},
-};
-
 static std::map<std::string, ncclDataType_t> kNcclDtypeMap = {
  {"kNumberTypeFloat32", ncclFloat}, {"kNumberTypeFloat16", ncclHalf}, {"kNumberTypeInt32", ncclInt}};

@ -53,174 +37,22 @@ typedef ncclResult_t (*AllGather)(const void *, void *, size_t, ncclDataType_t,
 typedef ncclResult_t (*ReduceScatter)(const void *, void *, size_t, ncclDataType_t, ncclRedOp_t, cudaStream_t,
                                      const std::string &);
 typedef ncclResult_t (*Broadcast)(const void *, void *, size_t, ncclDataType_t, int, cudaStream_t, const std::string &);
+typedef ncclResult_t (*Send)(const void *, size_t, ncclDataType_t, int, cudaStream_t, const std::string &);
+typedef ncclResult_t (*Recv)(void *, size_t, ncclDataType_t, int, cudaStream_t, const std::string &);
+typedef ncclResult_t (*GroupStart)();
+typedef ncclResult_t (*GroupEnd)();
+typedef std::vector<int> (*GetGroupRanks)(const std::string &);

-template <typename T>
 class NcclGpuKernel : public GpuKernel {
 public:
-  NcclGpuKernel()
-      : nccl_kernel_type_(NCCL_INVALID_TYPE),
-        nccl_reduce_type_(ncclSum),
-        group_name_(""),
-        input_size_(0),
-        output_size_(0),
-        root_(0),
-        collective_handle_(nullptr),
-        comm_stream_(nullptr) {}
+  NcclGpuKernel() : group_name_(""), nccl_data_type_(ncclHalf) {}
  ~NcclGpuKernel() override = default;

-  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
-  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
-  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
-  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
-              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
-    T *input_addr = GetDeviceAddress<T>(inputs, 0);
-    T *output_addr = GetDeviceAddress<T>(outputs, 0);
-
-    cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
-    switch (nccl_kernel_type_) {
-      case NCCL_ALL_REDUCE: {
-        auto all_reduce_funcptr =
-          reinterpret_cast<AllReduce>(dlsym(const_cast<void *>(collective_handle_), "AllReduce"));
-        MS_EXCEPTION_IF_NULL(all_reduce_funcptr);
-        CHECK_NCCL_RET_WITH_EXCEPT((*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T),
-                                                         nccl_data_type_, nccl_reduce_type_, stream, group_name_),
-                                   "ncclAllReduce failed");
-        break;
-      }
-      case NCCL_ALL_GATHER: {
-        auto all_gather_funcptr =
-          reinterpret_cast<AllGather>(dlsym(const_cast<void *>(collective_handle_), "AllGather"));
-        MS_EXCEPTION_IF_NULL(all_gather_funcptr);
-        CHECK_NCCL_RET_WITH_EXCEPT(
-          (*all_gather_funcptr)(input_addr, output_addr, input_size_ / sizeof(T), nccl_data_type_, stream, group_name_),
-          "ncclAllGather failed");
-        break;
-      }
-      case NCCL_REDUCE_SCATTER: {
-        auto reduce_scatter_funcptr =
-          reinterpret_cast<ReduceScatter>(dlsym(const_cast<void *>(collective_handle_), "ReduceScatter"));
-        MS_EXCEPTION_IF_NULL(reduce_scatter_funcptr);
-        CHECK_NCCL_RET_WITH_EXCEPT((*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T),
-                                                             nccl_data_type_, nccl_reduce_type_, stream, group_name_),
-                                   "ncclReduceScatter failed");
-        break;
-      }
-      case NCCL_BROADCAST: {
-        auto broadcast_funcptr =
-          reinterpret_cast<Broadcast>(dlsym(const_cast<void *>(collective_handle_), "Broadcast"));
-        MS_EXCEPTION_IF_NULL(broadcast_funcptr);
-        for (int i = 0; i < SizeToInt(input_size_list_.size()); ++i) {
-          input_addr = GetDeviceAddress<T>(inputs, i);
-          output_addr = GetDeviceAddress<T>(outputs, i);
-          CHECK_NCCL_RET_WITH_EXCEPT((*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T),
-                                                          nccl_data_type_, root_, stream, group_name_),
-                                     "ncclBroadcast failed");
-        }
-        break;
-      }
-      default: {
-        MS_LOG(EXCEPTION) << "Kernel type " << nccl_kernel_type_ << " is not supported.";
-      }
-    }
-    return true;
-  }
-  bool Init(const CNodePtr &kernel_node) override {
-    nccl_data_type_ = kNcclDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))];
-    InferCommType(kernel_node);
-
-    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
-    for (size_t i = 0; i < input_num; ++i) {
-      auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
-      size_t size = sizeof(T);
-      for (size_t j = 0; j < shape.size(); j++) {
-        size *= IntToSize(shape[j]);
-      }
-      size_t aligned_size = (nccl_kernel_type_ != NCCL_ALL_REDUCE) ? size : AlignMemorySize(size);
-      input_size_list_.push_back(aligned_size);
-      input_size_ += aligned_size;
-    }
-    for (size_t i = 0; i < output_num; ++i) {
-      auto shape = AnfAlgo::GetOutputInferShape(kernel_node, i);
-      size_t size = sizeof(T);
-      for (size_t j = 0; j < shape.size(); j++) {
-        size *= IntToSize(shape[j]);
-      }
-      size_t aligned_size = (nccl_kernel_type_ != NCCL_ALL_REDUCE) ? size : AlignMemorySize(size);
-      output_size_list_.push_back(aligned_size);
-      output_size_ += aligned_size;
-    }
-
-    group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
-    MS_LOG(INFO) << AnfAlgo::GetCNodeName(kernel_node) << " for group " << group_name_;
-    auto comm_stream_attr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("stream_id");
-    if (comm_stream_attr) {
-      comm_stream_ = reinterpret_cast<cudaStream_t>(GetValue<uintptr_t>(comm_stream_attr));
-      MS_EXCEPTION_IF_NULL(comm_stream_);
-    }
-
-    collective_handle_ = device::gpu::CollectiveInitializer::instance().collective_handle();
-    MS_EXCEPTION_IF_NULL(collective_handle_);
-    return true;
-  }
-
 protected:
-  void InitSizeLists() override { return; }
-
- private:
-  void InferCommType(const CNodePtr &kernel_node) {
-    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
-    auto iter = kNcclTypeMap.find(kernel_name);
-    if (iter == kNcclTypeMap.end()) {
-      MS_LOG(EXCEPTION) << "Kernel " << kernel_name << " is not supported.";
-    } else {
-      nccl_kernel_type_ = iter->second;
-    }
-
-    auto reduce_op = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kAttrOp);
-    if (reduce_op) {
-      std::string type = GetValue<std::string>(reduce_op);
-      if (type == "sum") {
-        nccl_reduce_type_ = ncclSum;
-      } else if (type == "max") {
-        nccl_reduce_type_ = ncclMax;
-      } else if (type == "min") {
-        nccl_reduce_type_ = ncclMin;
-      } else if (type == "prod") {
-        nccl_reduce_type_ = ncclProd;
-      } else {
-        MS_LOG(EXCEPTION) << "Nccl reduce type " << type << " is not supported.";
-      }
-    }
-
-    auto root_rank = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kAttrRootRank);
-    if (root_rank) {
-      root_ = static_cast<int>(GetValue<int64_t>(root_rank));
-    }
-    return;
-  }
+  ncclDataType_t nccl_dtype(const TypeId &type_id) { return kNcclDtypeMap[TypeIdLabel(type_id)]; }

-  size_t AlignMemorySize(size_t size) const {
-    if (size == 0) {
-      return COMMUNICATION_MEM_ALIGN_SIZE;
-    }
-    return ((size + COMMUNICATION_MEM_ALIGN_SIZE - 1) / COMMUNICATION_MEM_ALIGN_SIZE) * COMMUNICATION_MEM_ALIGN_SIZE;
-  }
-
-  NcclKernelType nccl_kernel_type_;
-  ncclRedOp_t nccl_reduce_type_;
-  ncclDataType_t nccl_data_type_;
  std::string group_name_;
-  size_t input_size_;
-  size_t output_size_;
-  int root_;
-  std::vector<size_t> input_size_list_;
-  std::vector<size_t> output_size_list_;
-  std::vector<size_t> workspace_size_list_;
-  const void *collective_handle_;
-  cudaStream_t comm_stream_;
-
-  static const size_t COMMUNICATION_MEM_ALIGN_SIZE = 16;
+  ncclDataType_t nccl_data_type_;
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_recv_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_recv_gpu_kernel.cc
@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/nccl/nccl_recv_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+MS_REG_GPU_KERNEL_ONE(Receive, KernelAttr().AddAllSameAttr(true).AddOutputAttr(kNumberTypeFloat32), NcclRecvGpuKernel,
+                      float);  // float32 output -> NcclRecvGpuKernel<float> (presumably; macro is defined elsewhere)
+MS_REG_GPU_KERNEL_ONE(Receive, KernelAttr().AddAllSameAttr(true).AddOutputAttr(kNumberTypeFloat16), NcclRecvGpuKernel,
+                      half);  // float16 output -> NcclRecvGpuKernel<half>
+MS_REG_GPU_KERNEL_ONE(Receive, KernelAttr().AddAllSameAttr(true).AddOutputAttr(kNumberTypeInt32), NcclRecvGpuKernel,
+                      int);  // int32 output -> NcclRecvGpuKernel<int>; no input attrs are declared for Receive
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_recv_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_recv_gpu_kernel.h
@ -0,0 +1,88 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_RECV_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_RECV_GPU_KERNEL_H_
+
+#include <vector>
+#include <string>
+#include <functional>
+#include "backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+// Kernel for the point-to-point Receive op: fills output 0 of the node through the "Recv" entry point of the
+// collective library, reading from rank `src_rank` of the communication group. T is the tensor element type.
+template <typename T>
+class NcclRecvGpuKernel : public NcclGpuKernel {
+ public:
+  NcclRecvGpuKernel() = default;
+  ~NcclRecvGpuKernel() override = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  // Resolves "Recv" in the collective library and calls it on the caller's stream for output 0.
+  // Raises an exception when the symbol is missing or the call reports an error.
+  bool Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &outputs,
+              void *stream_ptr) override {
+    T *output_addr = GetDeviceAddress<T>(outputs, 0);
+    void *library = const_cast<void *>(collective_handle_);
+    auto nccl_recv_func = reinterpret_cast<Recv>(dlsym(library, "Recv"));
+    MS_EXCEPTION_IF_NULL(nccl_recv_func);
+
+    const size_t elem_count = output_size_list_[0] / sizeof(T);
+    cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
+    CHECK_NCCL_RET_WITH_EXCEPT(
+      (*nccl_recv_func)(output_addr, elem_count, nccl_data_type_, src_rank_, stream, group_name_), "ncclRecv failed");
+    return true;
+  }
+
+  // Validates that the node has no input and exactly one output, then records source rank, group name, NCCL data
+  // type, the output byte size and the collective library handle. Returns false on a wrong input/output count.
+  bool Init(const CNodePtr &kernel_node) override {
+    MS_EXCEPTION_IF_NULL(kernel_node);
+    const size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+    if (input_num != 0) {
+      MS_LOG(ERROR) << "Input number is " << input_num << ", but NCCL receive needs 0 input.";
+      return false;
+    }
+    const size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+    if (output_num != 1) {
+      MS_LOG(ERROR) << "Output number is " << output_num << ", but NCCL receive needs 1 output.";
+      return false;
+    }
+
+    src_rank_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "src_rank"));
+    group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
+    nccl_data_type_ = nccl_dtype(AnfAlgo::GetOutputDeviceDataType(kernel_node, 0));
+
+    // Byte size of output 0: sizeof(T) multiplied by every dimension of its inferred shape.
+    size_t output_size = sizeof(T);
+    for (const auto &dim : AnfAlgo::GetOutputInferShape(kernel_node, 0)) {
+      output_size *= dim;
+    }
+    output_size_list_.push_back(output_size);
+    MS_LOG(INFO) << "NcclRecv source rank is " << src_rank_ << ", group name is " << group_name_;
+
+    collective_handle_ = device::gpu::CollectiveInitializer::instance().collective_handle();
+    MS_EXCEPTION_IF_NULL(collective_handle_);
+    return true;
+  }
+
+ protected:
+  // Size lists are filled directly in Init(), so there is nothing to do here.
+  void InitSizeLists() override {}
+
+ private:
+  std::vector<size_t> input_size_list_;  // Stays empty: Receive takes no input.
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+  int src_rank_ = -1;  // Rank to receive from, read from the "src_rank" attribute.
+  const void *collective_handle_ = nullptr;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_RECV_GPU_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_send_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_send_gpu_kernel.cc
@ -0,0 +1,31 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/nccl/nccl_send_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+MS_REG_GPU_KERNEL_ONE(
+  Send, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  NcclSendGpuKernel, float);  // float32 in/out -> NcclSendGpuKernel<float> (presumably; macro is defined elsewhere)
+MS_REG_GPU_KERNEL_ONE(
+  Send, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+  NcclSendGpuKernel, half);  // float16 in/out -> NcclSendGpuKernel<half>
+MS_REG_GPU_KERNEL_ONE(Send,
+                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+                      NcclSendGpuKernel, int);  // int32 in/out -> NcclSendGpuKernel<int>
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_send_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_send_gpu_kernel.h
@ -0,0 +1,84 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_SEND_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_SEND_GPU_KERNEL_H_
+
+#include <vector>
+#include <string>
+#include <functional>
+#include "backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+// Kernel for the point-to-point Send op: passes input 0 of the node to the "Send" entry point of the collective
+// library, addressed to rank `dest_rank` of the communication group. T is the tensor element type.
+template <typename T>
+class NcclSendGpuKernel : public NcclGpuKernel {
+ public:
+  NcclSendGpuKernel() : dest_rank_(-1), collective_handle_(nullptr) {}
+  ~NcclSendGpuKernel() override = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  // Resolves "Send" in the collective library and calls it on the caller's stream for input 0. The outputs are
+  // not touched. Raises an exception when the symbol is missing or the call reports an error.
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+              const std::vector<AddressPtr> &, void *stream_ptr) override {
+    T *input_addr = GetDeviceAddress<T>(inputs, 0);
+    auto nccl_send_func = reinterpret_cast<Send>(dlsym(const_cast<void *>(collective_handle_), "Send"));
+    MS_EXCEPTION_IF_NULL(nccl_send_func);
+    CHECK_NCCL_RET_WITH_EXCEPT((*nccl_send_func)(input_addr, input_size_list_[0] / sizeof(T), nccl_data_type_,
+                                                 dest_rank_, reinterpret_cast<cudaStream_t>(stream_ptr), group_name_),
+                               "ncclSend failed");
+    return true;
+  }
+
+  // Validates that the node has exactly one input, then records destination rank, group name, NCCL data type, the
+  // input byte size and the collective library handle. Returns false on a wrong input count.
+  bool Init(const CNodePtr &kernel_node) override {
+    MS_EXCEPTION_IF_NULL(kernel_node);
+    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+    if (input_num != 1) {
+      MS_LOG(ERROR) << "Input number is " << input_num << ", but NCCL send needs 1 input.";
+      return false;
+    }
+
+    dest_rank_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "dest_rank"));
+    group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
+    nccl_data_type_ = nccl_dtype(AnfAlgo::GetInputDeviceDataType(kernel_node, 0));
+    MS_LOG(INFO) << "NcclSend dest rank is " << dest_rank_ << ", group name is " << group_name_;
+
+    // The number of bytes to send is defined by the shape of input 0 (the producing node's output), not by this
+    // node's own output shape; this matches how the collective kernels size their inputs.
+    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    size_t input_size = std::accumulate(input_shape.begin(), input_shape.end(), sizeof(T), std::multiplies<size_t>());
+    input_size_list_.push_back(input_size);
+    // A single zero-byte output entry is registered for the node.
+    output_size_list_.push_back(0);
+
+    collective_handle_ = device::gpu::CollectiveInitializer::instance().collective_handle();
+    MS_EXCEPTION_IF_NULL(collective_handle_);
+    return true;
+  }
+
+ protected:
+  // Size lists are filled directly in Init(), so there is nothing to do here.
+  void InitSizeLists() override {}
+
+ private:
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+  int dest_rank_;  // Rank to send to, read from the "dest_rank" attribute.
+  const void *collective_handle_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NCCL_SEND_GPU_KERNEL_H_
--- a/mindspore/ccsrc/frontend/parallel/pipeline_transformer/pipeline_transformer.cc
+++ b/mindspore/ccsrc/frontend/parallel/pipeline_transformer/pipeline_transformer.cc
@ -207,7 +207,7 @@ SendAttr PipelineTransformer::InsertSend(const FuncGraphPtr &graph, const AnfNod
  auto dest_rank = global_rank_ + (user_node_stage - node_stage) * per_stage_rank_num_;
  Attr attr_rank = std::make_pair("dest_rank", MakeValue(dest_rank));
  OperatorAttrs attrs = {attr_tag, attr_rank};
-  auto send_op = CreatOpInstance(attrs, "_Send", "send");
+  auto send_op = CreatOpInstance(attrs, "Send", "send");
  auto send_node = NewValueNode(send_op);
  auto prim = GetValueNode<PrimitivePtr>(send_node);
  auto shape_type_pair = GetShapeType(parameter);
@ -233,7 +233,7 @@ void PipelineTransformer::InsertReceive(const FuncGraphPtr &graph, const AnfNode
  Attr attr_shape = std::make_pair("shape", shape_type_pair.first);
  Attr attr_dtype = std::make_pair("dtype", shape_type_pair.second);
  OperatorAttrs attrs = {attr_tag, attr_rank, attr_shape, attr_dtype};
-  auto recv_op = CreatOpInstance(attrs, "_Receive", "recv");
+  auto recv_op = CreatOpInstance(attrs, "Receive", "recv");
  std::vector<AnfNodePtr> recv_input = {NewValueNode(recv_op), virtual_param_};
  auto recv = graph->NewCNode(recv_input);
  manager_->SetEdge(use_node, index, recv);
--- a/mindspore/ccsrc/runtime/device/gpu/distribution/collective_common.h
+++ b/mindspore/ccsrc/runtime/device/gpu/distribution/collective_common.h
@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_COLLECTIVE_COMMON_H_

 #include <nccl.h>
+#include <vector>
 #include <sstream>
 #include "pybind11/pybind11.h"

@ -31,6 +32,7 @@ struct NcclGroupInfo {
  int rank;
  ncclUniqueId unique_id;
  ncclComm_t comm;
+  std::vector<int> group_ranks;
 };
 #define CHECK_RET(expression, result, message)                                                                         \
  {                                                                                                                    \
--- a/mindspore/ccsrc/runtime/device/gpu/distribution/collective_wrapper.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/distribution/collective_wrapper.cc
@ -53,3 +53,21 @@ ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t count,
                       cudaStream_t stream, const std::string &group) {
  return NCCLWrapper::instance().Broadcast(input_addr, output_addr, count, data_type, root, stream, group);
 }
+
+// Exported wrapper (declared extern "C" in collective_wrapper.h); the GPU send kernel resolves it by the
+// symbol name "Send" through dlsym. All arguments are forwarded unchanged to the NCCLWrapper singleton.
+ncclResult_t Send(const void *send_addr, size_t count, ncclDataType_t data_type, int peer_rank, cudaStream_t stream,
+                  const std::string &group_name) {
+  auto &nccl_wrapper = NCCLWrapper::instance();
+  return nccl_wrapper.Send(send_addr, count, data_type, peer_rank, stream, group_name);
+}
+
+// Exported wrapper (declared extern "C" in collective_wrapper.h) that forwards all arguments unchanged
+// to NCCLWrapper::Recv on the singleton instance.
+ncclResult_t Recv(void *recv_addr, size_t count, ncclDataType_t data_type, int peer_rank, cudaStream_t stream,
+                  const std::string &group_name) {
+  auto &nccl_wrapper = NCCLWrapper::instance();
+  return nccl_wrapper.Recv(recv_addr, count, data_type, peer_rank, stream, group_name);
+}
+
+// Exported wrapper that delegates to NCCLWrapper::GroupStart on the singleton instance.
+ncclResult_t GroupStart() {
+  auto &nccl_wrapper = NCCLWrapper::instance();
+  return nccl_wrapper.GroupStart();
+}
+
+// Exported wrapper that delegates to NCCLWrapper::GroupEnd on the singleton instance.
+ncclResult_t GroupEnd() {
+  auto &nccl_wrapper = NCCLWrapper::instance();
+  return nccl_wrapper.GroupEnd();
+}
+
+// Exported wrapper returning, by value, the rank list NCCLWrapper holds for `group_name`.
+std::vector<int> GetGroupRanks(const std::string &group_name) {
+  auto &nccl_wrapper = NCCLWrapper::instance();
+  return nccl_wrapper.GetGroupRanks(group_name);
+}
--- a/mindspore/ccsrc/runtime/device/gpu/distribution/collective_wrapper.h
+++ b/mindspore/ccsrc/runtime/device/gpu/distribution/collective_wrapper.h
@ -48,3 +48,10 @@ extern "C" EXPORT_WRAPPER ncclResult_t ReduceScatter(const void *input_addr, voi
 extern "C" EXPORT_WRAPPER ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t count,
                                                 ncclDataType_t data_type, int root, cudaStream_t stream,
                                                 const std::string &group);
+// Point-to-point and group-call entry points exported with C linkage.
+// Send/Recv transfer `count` elements of `data_type` to/from `peer_rank` on `stream` for the named group.
+extern "C" EXPORT_WRAPPER ncclResult_t Send(const void *send_addr, size_t count, ncclDataType_t data_type,
+                                            int peer_rank, cudaStream_t stream, const std::string &group_name);
+extern "C" EXPORT_WRAPPER ncclResult_t Recv(void *recv_addr, size_t count, ncclDataType_t data_type, int peer_rank,
+                                            cudaStream_t stream, const std::string &group_name);
+// GroupStart/GroupEnd take no arguments and return the NCCL result code.
+extern "C" EXPORT_WRAPPER ncclResult_t GroupStart();
+extern "C" EXPORT_WRAPPER ncclResult_t GroupEnd();
+// NOTE(review): this returns a C++ class type (std::vector<int>) from a function with C linkage, so it is
+// only usable from C++ callers -- confirm every consumer of this symbol is built as C++.
+extern "C" EXPORT_WRAPPER std::vector<int> GetGroupRanks(const std::string &group_name);
--- a/mindspore/ccsrc/runtime/device/gpu/distribution/mpi_wrapper.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/distribution/mpi_wrapper.cc
@ -68,7 +68,7 @@ bool MPIWrapper::CreateCommGroup(const std::string &group_name, const std::vecto
    return false;
  }

-  NcclGroupInfo nccl_group = {static_cast<int>(ranks.size()), group_rank[0], group_unique_id, nullptr};
+  NcclGroupInfo nccl_group = {static_cast<int>(ranks.size()), group_rank[0], group_unique_id, nullptr, ranks};
  NCCLWrapper::instance().AddGroupInfo(group_name, &nccl_group);
  return true;
 }
@ -122,7 +122,11 @@ void MPIWrapper::Init() {
  CHECK_RET(MPI_Bcast(reinterpret_cast<void *>(&unique_id), sizeof(unique_id), MPI_BYTE, 0, MPI_COMM_WORLD),
            MPI_SUCCESS, "Failed to broadcast nccl unique id.");

-  NcclGroupInfo world_group = {rank_size_, rank_id_, unique_id, nullptr};
+  std::vector<int> world_group_ranks = {};
+  for (int global_rank = 0; global_rank < rank_size_; global_rank++) {
+    world_group_ranks.push_back(global_rank);
+  }
+  NcclGroupInfo world_group = {rank_size_, rank_id_, unique_id, nullptr, world_group_ranks};
  NCCLWrapper::instance().AddGroupInfo(NCCL_WORLD_GROUP, &world_group);
  return;
 }
--- a/mindspore/ccsrc/runtime/device/gpu/distribution/nccl_wrapper.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/distribution/nccl_wrapper.cc
@ -14,6 +14,7 @@
 * limitations under the License.
 */

+#include <vector>
 #include "runtime/device/gpu/distribution/nccl_wrapper.h"

 namespace mindspore {
@ -74,6 +75,24 @@ ncclResult_t NCCLWrapper::Broadcast(const void *input_addr, void *output_addr, s
  return ncclBroadcast(input_addr, output_addr, count, data_type, root, group_comm, stream);
 }

+// Sends `count` elements of `data_type` from `send_addr` to `peer_rank` with ncclSend on `stream`,
+// using the communicator stored for `group_name`. CHECK_RET first compares the number of
+// group_info_ entries for that name against 1.
+ncclResult_t NCCLWrapper::Send(const void *send_addr, size_t count, ncclDataType_t data_type, int peer_rank,
+                               cudaStream_t stream, const std::string &group_name) {
+  CHECK_RET(group_info_.count(group_name), 1, "Failed to find group info for Send by the group name " + group_name);
+  const auto &group = group_info_[group_name];
+  return ncclSend(send_addr, count, data_type, peer_rank, group.comm, stream);
+}
+
+// Receives `count` elements of `data_type` into `recv_addr` from `peer_rank` with ncclRecv on `stream`,
+// using the communicator stored for `group_name`. CHECK_RET first compares the number of
+// group_info_ entries for that name against 1.
+ncclResult_t NCCLWrapper::Recv(void *recv_addr, size_t count, ncclDataType_t data_type, int peer_rank,
+                               cudaStream_t stream, const std::string &group_name) {
+  CHECK_RET(group_info_.count(group_name), 1, "Failed to find group info for Recv by the group name " + group_name);
+  const auto &group = group_info_[group_name];
+  return ncclRecv(recv_addr, count, data_type, peer_rank, group.comm, stream);
+}
+
+// Thin pass-through to ncclGroupStart; its result code is returned unchanged.
+ncclResult_t NCCLWrapper::GroupStart() {
+  const ncclResult_t status = ncclGroupStart();
+  return status;
+}
+
+// Thin pass-through to ncclGroupEnd; its result code is returned unchanged.
+ncclResult_t NCCLWrapper::GroupEnd() {
+  const ncclResult_t status = ncclGroupEnd();
+  return status;
+}
+
 void NCCLWrapper::AddGroupInfo(const std::string &group_name, NcclGroupInfo *group) {
  if (comm_init_done_) {
    CHECK_RET(ncclCommInitRank(&(group->comm), group->size, group->unique_id, group->rank), ncclSuccess,
@ -92,6 +111,12 @@ void NCCLWrapper::DestroyGroup(const std::string &group_name) {
  group_info_.erase(group_iter);
  return;
 }
+
+// Returns a copy of the rank list recorded for `group_name`.
+// CHECK_RET first compares the number of group_info_ entries for that name against 1.
+std::vector<int> NCCLWrapper::GetGroupRanks(const std::string &group_name) {
+  CHECK_RET(group_info_.count(group_name), 1,
+            "Failed to find group info for GetGroupRanks by the group name " + group_name);
+  const std::vector<int> &ranks = group_info_[group_name].group_ranks;
+  return ranks;
+}
 }  // namespace gpu
 }  // namespace device
 }  // namespace mindspore
--- a/mindspore/ccsrc/runtime/device/gpu/distribution/nccl_wrapper.h
+++ b/mindspore/ccsrc/runtime/device/gpu/distribution/nccl_wrapper.h
@ -21,6 +21,7 @@
 #include <stdlib.h>
 #include <nccl.h>
 #include <string>
+#include <vector>
 #include <map>
 #include "runtime/device/gpu/distribution/collective_common.h"

@ -34,16 +35,23 @@ class NCCLWrapper {
  static NCCLWrapper &instance();
  ncclUniqueId nccl_unique_id() const;
  void InitNCCLComm();
-  ncclResult_t AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDataType_t datatype,
-                         ncclRedOp_t op, cudaStream_t stream, const std::string &group_name = NCCL_WORLD_GROUP);
-  ncclResult_t AllGather(const void *input_addr, void *output_addr, size_t count, ncclDataType_t datatype,
-                         cudaStream_t stream, const std::string &group_name = NCCL_WORLD_GROUP);
-  ncclResult_t ReduceScatter(const void *input_addr, void *output_addr, size_t count, ncclDataType_t datatype,
-                             ncclRedOp_t op, cudaStream_t stream, const std::string &group_name = NCCL_WORLD_GROUP);
-  ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDataType_t datatype, int root,
-                         cudaStream_t stream, const std::string &group_name = NCCL_WORLD_GROUP);
+  ncclResult_t AllReduce(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data_type,
+                         ncclRedOp_t op, cudaStream_t stream, const std::string &group_name);
+  ncclResult_t AllGather(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data_type,
+                         cudaStream_t stream, const std::string &group_name);
+  ncclResult_t ReduceScatter(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data_type,
+                             ncclRedOp_t op, cudaStream_t stream, const std::string &group_name);
+  ncclResult_t Broadcast(const void *input_addr, void *output_addr, size_t count, ncclDataType_t data_type, int root,
+                         cudaStream_t stream, const std::string &group_name);
+  ncclResult_t Send(const void *send_addr, size_t count, ncclDataType_t data_type, int peer_rank, cudaStream_t stream,
+                    const std::string &group_name);
+  ncclResult_t Recv(void *recv_addr, size_t count, ncclDataType_t data_type, int peer_rank, cudaStream_t stream,
+                    const std::string &group_name);
+  ncclResult_t GroupStart();
+  ncclResult_t GroupEnd();
  void AddGroupInfo(const std::string &group_name, NcclGroupInfo *group);
  void DestroyGroup(const std::string &group_name);
+  std::vector<int> GetGroupRanks(const std::string &group_name);

 private:
  NCCLWrapper() : comm_init_done_(false) {}
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_stream_assign.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_stream_assign.cc
@ -143,17 +143,17 @@ void InsertStreamSwitchNode(const std::shared_ptr<session::KernelGraph> &kernel_
    size_t recv_node_offset = pair.recv_node_offset;
    CNodePtr send_node = nullptr;
    CNodePtr recv_node = nullptr;
-    // Step 1: generate Send and Recv CNodes.
+    // Step 1: Generate stream Send and Recv CNodes.
    if (stream_switch_type == kAllReduceStreamSwitch) {
      if (!GenSendRecvCNodesForAllReduce(kernel_graph, mock_send_node, mock_recv_node, &send_node, &recv_node)) {
        MS_LOG(EXCEPTION) << "Generating CNodes for send and recv failed. Stream switch type: kAllReduceStreamSwitch";
      }
    }
-    // Step 2: sort send and recv CNodes by offset.
+    // Step 2: Sort send and recv CNodes by offset.
    ordered_stream_switch_nodes.insert({send_node_offset, send_node});
    ordered_stream_switch_nodes.insert({recv_node_offset, recv_node});
  }
-  // Step 3: insert stream switch CNodes into execution kernel list.
+  // Step 3: Insert stream switch CNodes into execution kernel list.
  auto execution_kernels = kernel_graph->execution_order();
  for (auto node = ordered_stream_switch_nodes.rbegin(); node != ordered_stream_switch_nodes.rend(); node++) {
    execution_kernels.insert(execution_kernels.begin() + node->offset, node->cnode);
--- a/mindspore/core/base/core_ops.h
+++ b/mindspore/core/base/core_ops.h
@ -185,7 +185,7 @@ inline const PrimitivePtr kPrimSGD = std::make_shared<Primitive>("SGD");
 inline const PrimitivePtr kPrimMirror = std::make_shared<Primitive>("_MirrorOperator");
 inline const PrimitivePtr kPrimVirtualDiv = std::make_shared<Primitive>("_VirtualDiv");
 inline const PrimitivePtr kPrimVirtualDataset = std::make_shared<Primitive>("_VirtualDataset");
-inline const PrimitivePtr kPrimReceive = std::make_shared<Primitive>("_Receive");
+inline const PrimitivePtr kPrimReceive = std::make_shared<Primitive>("Receive");
 inline const PrimitivePtr kPrimAllReduce = std::make_shared<Primitive>("AllReduce");
 inline const PrimitivePtr kPrimAllSwap = std::make_shared<Primitive>("AllSwap");
 inline const PrimitivePtr kPrimBroadcast = std::make_shared<Primitive>("Broadcast");
--- a/mindspore/ops/_grad/grad_comm_ops.py
+++ b/mindspore/ops/_grad/grad_comm_ops.py
@ -20,7 +20,7 @@ from .. import operations as P
 from ...common.tensor import RowTensor
 from ..composite.multitype_ops.zeros_like_impl import zeros_like
 from ..operations.comm_ops import (AllGather, _HostAllGather, AllReduce, _AlltoAll, Broadcast,
-                                   _GetTensorSlice, _MirrorOperator, ReduceOp, _Send, _Receive,
+                                   _GetTensorSlice, _MirrorOperator, ReduceOp, Send, Receive,
                                   ReduceScatter, _HostReduceScatter, _VirtualDiv, AllSwap)
 from .grad_base import bprop_getters

@ -77,12 +77,12 @@ def get_bprop_all_reduce(self):
    return bprop


-@bprop_getters.register(_Send)
+@bprop_getters.register(Send)
 def get_bprop_send(self):
    """Generate bprop for Send."""
    shape = self.get_attr_dict()["shape"]
    dtype = self.get_attr_dict()["dtype"]
-    send_grad = _Receive(self.sr_tag, self.rank, shape, dtype, self.group)
+    send_grad = Receive(self.sr_tag, self.rank, shape, dtype, self.group)

    def bprop(x, out, dout):
        dx = send_grad()
@ -90,10 +90,10 @@ def get_bprop_send(self):
    return bprop


-@bprop_getters.register(_Receive)
+@bprop_getters.register(Receive)
 def get_bprop_receive(self):
    """Generate bprop for Receive."""
-    receive_grad = _Send(self.tag, self.rank, self.group)
+    receive_grad = Send(self.tag, self.rank, self.group)
    depend = P.Depend()
    cast = P.Cast()

--- a/mindspore/ops/operations/__init__.py
+++ b/mindspore/ops/operations/__init__.py
@ -36,7 +36,7 @@ from .array_ops import (Argmax, Argmin, Cast, Concat, Pack, Unpack,
                        Unique, GatherD, Identity, RepeatElements)
 from .comm_ops import (AllGather, AllReduce, _AlltoAll, AllSwap, ReduceScatter, Broadcast,
                       _MirrorOperator, ReduceOp, _VirtualDataset,
-                       _VirtualDiv, _GetTensorSlice, _Send, _Receive,
+                       _VirtualDiv, _GetTensorSlice, Send, Receive,
                       _HostAllGather, _HostReduceScatter)
 from .debug_ops import (ImageSummary, InsertGradientOf, HookBackward, ScalarSummary,
                        TensorSummary, HistogramSummary, Print, Assert)
--- a/mindspore/ops/operations/comm_ops.py
+++ b/mindspore/ops/operations/comm_ops.py
@ -116,7 +116,7 @@ class AllReduce(PrimitiveWithInfer):
        return x_dtype


-class _Send(PrimitiveWithInfer):
+class Send(PrimitiveWithInfer):
    """
    Send tensors from src_rank to the specified dest_rank.

@ -145,7 +145,7 @@ class _Send(PrimitiveWithInfer):
        >>>     def __init__(self):
        >>>         super(Net, self).__init__()
        >>>         self.depend = P.Depend()
-        >>>         self.send = P._Send(st_tag=0, dest_rank=8, group="hccl_world_group")
+        >>>         self.send = P.Send(sr_tag=0, dest_rank=8, group="hccl_world_group")
        >>>
        >>>     def construct(self, x):
        >>>         out = self.depend(x, self.send(x))
@ -170,7 +170,7 @@ class _Send(PrimitiveWithInfer):
        return x_dtype


-class _Receive(PrimitiveWithInfer):
+class Receive(PrimitiveWithInfer):
    """
    receive tensors from src_rank.

@ -201,7 +201,7 @@ class _Receive(PrimitiveWithInfer):
        >>> class Net(nn.Cell):
        >>>     def __init__(self):
        >>>         super(Net, self).__init__()
-        >>>         self.recv = P._Receive(st_tag=0, src_rank=0, shape=[2, 8], dtype=np.float32,
+        >>>         self.recv = P.Receive(sr_tag=0, src_rank=0, shape=[2, 8], dtype=np.float32,
        >>>                               group="hccl_world_group")
        >>>
        >>>     def construct(self, x):
--- a/tests/st/nccl/test_nccl_all.py
+++ b/tests/st/nccl/test_nccl_all.py
@ -53,3 +53,10 @@ def test_nccl_reduce_scatter_op():
 def test_nccl_broadcast_op():
    return_code = os.system("mpirun -n 8 pytest -s test_nccl_broadcast_op.py")
    assert return_code == 0
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_single
+def test_nccl_send_recv_op():
+    """Run test_nccl_send_recv_op.py under mpirun with 8 processes and require exit status 0."""
+    exit_status = os.system("mpirun -n 8 pytest -s test_nccl_send_recv_op.py")
+    assert exit_status == 0
--- a/tests/st/nccl/test_nccl_all_gather_op.py
+++ b/tests/st/nccl/test_nccl_all_gather_op.py
@ -48,7 +48,7 @@ def test_AllGather():
    for i in range(size - 1):
        tmp = np.ones([1, 1, 3, 3]).astype(np.float32) * 0.01 * (i + 2)
        expect = np.concatenate((expect, tmp))
-    diff = output.asnumpy() - expect
+    diff = np.absolute(output.asnumpy() - expect)
    error = np.ones(shape=expect.shape) * 1.0e-5
    assert np.all(diff < error)
    assert output.shape == expect.shape
--- a/tests/st/nccl/test_nccl_send_recv_op.py
+++ b/tests/st/nccl/test_nccl_send_recv_op.py
@ -0,0 +1,69 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import numpy as np
+
+import mindspore.context as context
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore.common.initializer import initializer
+from mindspore.common.parameter import Parameter
+from mindspore.communication.management import init, NCCL_WORLD_COMM_GROUP, get_rank, get_group_size
+from mindspore.ops import operations as P
+from mindspore.common import dtype as mstype
+
+context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
+
+init()
+rank = get_rank()
+size = get_group_size()
+if size % 2 != 0:
+    raise RuntimeError("Group size should be divided by 2 exactly.")
+x = np.ones([3, 3, 3, 3]).astype(np.float32) * 0.01 * (rank + 1)
+
+
+class SendNet(nn.Cell):
+    """Cell that sends its parameter `x` to the rank `size // 2` positions after this one."""
+
+    def __init__(self):
+        super().__init__()
+        self.x = Parameter(initializer(Tensor(x), x.shape), name='x')
+        self.depend = P.Depend()
+        # The receiving peer is the rank with the same index in the second half of the group.
+        peer_rank = rank + size // 2
+        self.send = P.Send(sr_tag=0, dest_rank=peer_rank, group=NCCL_WORLD_COMM_GROUP)
+
+    def construct(self):
+        # Depend returns `x` while making the result depend on the send operation.
+        return self.depend(self.x, self.send(self.x))
+
+class RecvNet(nn.Cell):
+    """Cell that receives a [3, 3, 3, 3] float32 tensor from the rank `size // 2` positions before this one."""
+
+    def __init__(self):
+        super().__init__()
+        # The sending peer is the rank with the same index in the first half of the group.
+        peer_rank = rank - size // 2
+        self.recv = P.Receive(sr_tag=0, src_rank=peer_rank, shape=[3, 3, 3, 3], dtype=mstype.float32,
+                              group=NCCL_WORLD_COMM_GROUP)
+
+    def construct(self):
+        return self.recv()
+
+def test_send_recv():
+    """Lower-half ranks send their tensor; upper-half ranks receive it and compare with the sender's data."""
+    if rank < size / 2:
+        # Sender side: running the net is the whole test, nothing is checked locally.
+        SendNet()()
+        return
+
+    # Receiver side: the sender filled its tensor with 0.01 * (sender_rank + 1).
+    sender_rank = rank - size // 2
+    expect_output = np.ones([3, 3, 3, 3]).astype(np.float32) * 0.01 * (sender_rank + 1)
+    output = RecvNet()()
+
+    deviation = abs(output.asnumpy() - expect_output)
+    tolerance = np.ones(shape=output.shape) * 1.0e-5
+    assert np.all(deviation < tolerance)
+    assert expect_output.shape == output.shape