!12412 nlp perf(Pynative): change memory sync mode from synchronous to asynchronous in SyncHostToDevice

From: @zuochuanyong
Reviewed-by: 
Signed-off-by:
pull/12412/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit c74b4d5d73

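Review note: this change makes PyNative host-to-device copies asynchronous. Because rtMemcpyAsync only enqueues the copy, the host source must stay valid until the stream is synchronized, so each runtime snapshots the source into a heap buffer (AddBufferPtr) and releases all snapshots once the stream drains (FreeAndClearBufferPtrs). A minimal sketch of that pattern follows; Runtime, enqueue_async_copy, and synchronize_stream are hypothetical stand-ins, not MindSpore APIs (std::shared_ptr<char[]> needs C++17, as in the diff itself):

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <memory>
#include <vector>

// Hypothetical stand-in: a real runtime would enqueue the copy on a device
// stream and return immediately; here we copy eagerly for simplicity.
static void enqueue_async_copy(void *dst, const void *src, size_t size) {
  std::memcpy(dst, src, size);
}

// Hypothetical stand-in: block until every enqueued copy has finished.
static void synchronize_stream() {}

class Runtime {
 public:
  // Snapshot `src` so the caller may free or reuse it as soon as this call
  // returns, mirroring the MemcpyAsync overrides added in the diff.
  void MemcpyAsync(void *dst, const void *src, size_t size) {
    std::shared_ptr<char[]> buffer(new char[size]);
    std::copy(static_cast<const char *>(src),
              static_cast<const char *>(src) + size, buffer.get());
    buffers_.push_back(buffer);  // keep the snapshot alive until SyncStream
    enqueue_async_copy(dst, buffer.get(), size);
  }

  // Mirrors SyncStream + FreeAndClearBufferPtrs in the diff.
  void SyncStream() {
    synchronize_stream();
    buffers_.clear();  // copies are done; staging buffers can be freed
  }

 private:
  std::vector<std::shared_ptr<char[]>> buffers_;
};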
@@ -101,13 +101,23 @@ void SyncMemory(void *dst, const void *src, uint64_t size, rtMemcpyKind_t kind)
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);
MS_EXCEPTION_IF_NULL(runtime_instance);
runtime_instance->SetContext();
// Use asynchronous copy only in PyNative mode and only for RT_MEMCPY_HOST_TO_DEVICE
if (execution_mode != kPynativeMode || kind != RT_MEMCPY_HOST_TO_DEVICE) {
auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind);
if (ret_rt_memcpy != RT_ERROR_NONE) {
MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed";
}
} else {
auto ret = runtime_instance->MemcpyAsync(dst, src, size, static_cast<int32_t>(kind));
if (!ret) {
MS_EXCEPTION(DeviceProcessError) << "MemcpyAsync failed";
}
}
}
bool FloatToHalfAndSyncHostToDevice(void *dst, size_t dst_size, const void *src, size_t src_size) {
@@ -527,7 +537,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size
if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
return true;
}
SyncStream();
bool sync_ok = false;
std::vector<size_t> host_shape;
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), LongToSize);

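Review note: the SyncStream() added at the top of SyncHostToDevice drains any copies previously enqueued by MemcpyAsync (and frees their staging buffers) before this synchronous path touches device memory, so pending asynchronous copies cannot be observed out of order.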
@@ -718,6 +718,25 @@ bool AscendKernelRuntime::SyncStream() {
MS_LOG(ERROR) << "Call runtime rtStreamSynchronize error.";
return false;
}
FreeAndClearBufferPtrs();
return true;
}
bool AscendKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) {
InnerSetContext();
if (stream_ == nullptr) {
MS_LOG(ERROR) << "MemcpyAsync failed. stream_ is nullptr";
return false;
}
std::shared_ptr<char[]> buffer(new char[size]());
MS_EXCEPTION_IF_NULL(buffer);
std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get());
AddBufferPtr(buffer);
if (RT_ERROR_NONE != rtMemcpyAsync(dst, size, buffer.get(), size, static_cast<rtMemcpyKind_t>(kind), stream_)) {
MS_LOG(ERROR) << "Call runtime rtMemcpyAsync error.";
return false;
}
return true;
}

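Why the extra std::copy in MemcpyAsync above: rtMemcpyAsync returns before the copy executes, so handing it a caller-owned temporary directly would risk a use-after-free. Callers such as FloatToHalfAndSyncHostToDevice convert into a short-lived host buffer, which is exactly the hazard the snapshot avoids. Building on the hypothetical Runtime sketch above (assumes <cstdint> and <vector>):

// Hypothetical caller illustrating the lifetime hazard the snapshot avoids.
void ConvertAndCopy(Runtime *rt, void *device_ptr,
                    const std::vector<float> &host) {
  std::vector<uint16_t> half(host.size());
  // ... convert float -> float16 into `half` (conversion elided) ...
  rt->MemcpyAsync(device_ptr, half.data(), half.size() * sizeof(uint16_t));
}  // `half` is destroyed here; without the internal snapshot, the enqueued
   // copy could still be pending and would read freed memory.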
@@ -51,6 +51,7 @@ class AscendKernelRuntime : public KernelRuntime {
const std::vector<CNodePtr> &execution_order) override;
void ClearGlobalIdleMem() override;
bool SyncStream() override;
bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override;
void SetContext() override;
void CreateContext() override;
void *context() const override { return rt_context_; }

@@ -49,6 +49,7 @@ class CPUKernelRuntime : public KernelRuntime {
protected:
bool SyncStream() override { return true; };
bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override { return true; };
DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) override;

@@ -18,7 +18,9 @@
#include <vector>
#include <memory>
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "utils/log_adapter.h"
#include "utils/ms_context.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "ir/tensor.h"
#ifdef ENABLE_DEBUGGER
@@ -62,11 +64,22 @@ bool GPUDeviceAddress::SyncHostToDevice(const ShapeVector &, size_t size, TypeId
// NCCL kernel input and output device addresses are aligned, so the host size may not equal the device size
MS_LOG(INFO) << "Sync memory size is inconsistent, host size: " << size << ", device size: " << size_;
}
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
if (execution_mode != kPynativeMode) {
if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(ptr_, host_ptr, size, stream)) {
MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
return false;
}
return GPUDeviceManager::GetInstance().SyncStream(stream);
} else {
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kGPUDevice, device_id);
MS_EXCEPTION_IF_NULL(runtime_instance);
return runtime_instance->MemcpyAsync(ptr_, host_ptr, size, 0);
}
}
void GPUDeviceAddress::ClearDeviceMemory() {

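Note for reviewers: the GPU override of MemcpyAsync (next hunk) ignores the kind argument and always performs a host-to-device copy on the default stream, which is why the PyNative branch above can pass a literal 0.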
@@ -48,7 +48,15 @@ using mindspore::device::memswap::MemSwapInfoSet;
using mindspore::device::memswap::MemSwapManager;
using mindspore::device::memswap::SwapKind;
static const size_t PARAMETER_OUTPUT_INDEX = 0;
bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
bool GPUKernelRuntime::SyncStream() {
if (!GPUDeviceManager::GetInstance().SyncStream(stream_)) {
MS_LOG(ERROR) << "Call SyncStream error.";
return false;
}
FreeAndClearBufferPtrs();
return true;
}
bool GPUKernelRuntime::Init() {
auto context_ptr = MsContext::GetInstance();
@@ -183,6 +191,22 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
}
} // namespace
bool GPUKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) {
std::shared_ptr<char[]> buffer(new char[size]());
MS_EXCEPTION_IF_NULL(buffer);
std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get());
AddBufferPtr(buffer);
auto &stream = GPUDeviceManager::GetInstance().default_stream();
MS_EXCEPTION_IF_NULL(stream);
auto ret = GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(dst, buffer.get(), size, stream);
if (!ret) {
MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
return false;
}
return ret;
}
DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) {
return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);

@@ -52,6 +52,7 @@ class GPUKernelRuntime : public KernelRuntime {
DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) override;
bool SyncStream() override;
bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override;
private:
GPUKernelRuntime(const GPUKernelRuntime &);

@@ -75,6 +75,7 @@ class KernelRuntime {
const std::unordered_set<ValueNodePtr> &value_nodes,
const std::vector<CNodePtr> &execution_order);
virtual bool SyncStream() = 0;
virtual bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) = 0;
virtual void ClearGlobalIdleMem() {}
virtual void CreateContext() {}
virtual void SetContext() {}
@@ -101,6 +102,8 @@ class KernelRuntime {
virtual void PreInit() {}
virtual uint64_t GetAvailableMemMaxSize() const { return 0; }
void AddBufferPtr(std::shared_ptr<char[]> ptr) { buffer_ptrs_.push_back(ptr); }
void FreeAndClearBufferPtrs() { buffer_ptrs_.clear(); }
protected:
virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
@@ -149,6 +152,7 @@ class KernelRuntime {
void *stream_ = nullptr;
std::shared_ptr<MemoryManager> mem_manager_{nullptr};
std::map<uint32_t, std::vector<DynamicKernelPtr>> graph_dynamic_kernel_map_;
std::vector<std::shared_ptr<char[]>> buffer_ptrs_ = {};
};
using KernelRuntimePtr = std::shared_ptr<KernelRuntime>;
} // namespace device

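End to end, the new bookkeeping behaves like the following usage of the hypothetical Runtime sketch from the top of this page:

// Several async copies may be issued, then one synchronization point.
int main() {
  Runtime rt;
  std::vector<char> device(64);      // stands in for device memory
  std::vector<char> host(64, 'x');   // short-lived host data
  rt.MemcpyAsync(device.data(), host.data(), host.size());
  host.assign(host.size(), 'y');     // safe: the runtime holds a snapshot
  rt.SyncStream();                   // copies complete, snapshots freed
  return 0;
}

One caveat worth flagging in review: snapshots accumulate until the next SyncStream, so a long run of PyNative copies temporarily doubles the host memory held for in-flight data.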