!12412 nlp perf(Pynative): change memory sync mode from synchronous to asynchronous in SyncHostToDevice

From: @zuochuanyong
Reviewed-by: 
Signed-off-by:
pull/12412/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit c74b4d5d73

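Review note: this change makes PyNative host-to-device copies asynchronous. Because rtMemcpyAsync only enqueues the copy, the host source must stay valid until the stream is synchronized, so each runtime snapshots the source into a heap buffer (AddBufferPtr) and releases all snapshots once the stream drains (FreeAndClearBufferPtrs). A minimal sketch of that pattern follows; Runtime, enqueue_async_copy, and synchronize_stream are hypothetical stand-ins, not MindSpore APIs (std::shared_ptr<char[]> needs C++17, as in the diff itself):

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <memory>
#include <vector>

// Hypothetical stand-in: a real runtime would enqueue the copy on a device
// stream and return immediately; here we copy eagerly for simplicity.
static void enqueue_async_copy(void *dst, const void *src, size_t size) {
  std::memcpy(dst, src, size);
}

// Hypothetical stand-in: block until every enqueued copy has finished.
static void synchronize_stream() {}

class Runtime {
 public:
  // Snapshot `src` so the caller may free or reuse it as soon as this call
  // returns, mirroring the MemcpyAsync overrides added in the diff.
  void MemcpyAsync(void *dst, const void *src, size_t size) {
    std::shared_ptr<char[]> buffer(new char[size]);
    std::copy(static_cast<const char *>(src),
              static_cast<const char *>(src) + size, buffer.get());
    buffers_.push_back(buffer);  // keep the snapshot alive until SyncStream
    enqueue_async_copy(dst, buffer.get(), size);
  }

  // Mirrors SyncStream + FreeAndClearBufferPtrs in the diff.
  void SyncStream() {
    synchronize_stream();
    buffers_.clear();  // copies are done; staging buffers can be freed
  }

 private:
  std::vector<std::shared_ptr<char[]>> buffers_;
};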
@@ -101,13 +101,23 @@ void SyncMemory(void *dst, const void *src, uint64_t size, rtMemcpyKind_t kind)
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);
MS_EXCEPTION_IF_NULL(runtime_instance);
runtime_instance->SetContext();
// Use asynchronous copy only in PyNative mode and only for RT_MEMCPY_HOST_TO_DEVICE
if (execution_mode != kPynativeMode || kind != RT_MEMCPY_HOST_TO_DEVICE) {
auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind);
if (ret_rt_memcpy != RT_ERROR_NONE) {
MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed";
}
} else {
auto ret = runtime_instance->MemcpyAsync(dst, src, size, static_cast<int32_t>(kind));
if (!ret) {
MS_EXCEPTION(DeviceProcessError) << "MemcpyAsync failed";
}
}
}
bool FloatToHalfAndSyncHostToDevice(void *dst, size_t dst_size, const void *src, size_t src_size) {
@@ -527,7 +537,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size
if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
return true;
}
SyncStream();
bool sync_ok = false;
std::vector<size_t> host_shape;
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), LongToSize);

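Review note: the SyncStream() added at the top of SyncHostToDevice drains any copies previously enqueued by MemcpyAsync (and frees their staging buffers) before this synchronous path touches device memory, so pending asynchronous copies cannot be observed out of order.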
@@ -718,6 +718,25 @@ bool AscendKernelRuntime::SyncStream() {
MS_LOG(ERROR) << "Call runtime rtStreamSynchronize error.";
return false;
}
FreeAndClearBufferPtrs();
return true;
}
bool AscendKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) {
InnerSetContext();
if (stream_ == nullptr) {
MS_LOG(ERROR) << "MemcpyAsync failed. stream_ is nullptr";
return false;
}
std::shared_ptr<char[]> buffer(new char[size]());
MS_EXCEPTION_IF_NULL(buffer);
std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get());
AddBufferPtr(buffer);
if (RT_ERROR_NONE != rtMemcpyAsync(dst, size, buffer.get(), size, static_cast<rtMemcpyKind_t>(kind), stream_)) {
MS_LOG(ERROR) << "Call runtime rtMemcpyAsync error.";
return false;
}
return true;
}

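Why the extra std::copy in MemcpyAsync above: rtMemcpyAsync returns before the copy executes, so handing it a caller-owned temporary directly would risk a use-after-free. Callers such as FloatToHalfAndSyncHostToDevice convert into a short-lived host buffer, which is exactly the hazard the snapshot avoids. Building on the hypothetical Runtime sketch above (assumes <cstdint> and <vector>):

// Hypothetical caller illustrating the lifetime hazard the snapshot avoids.
void ConvertAndCopy(Runtime *rt, void *device_ptr,
                    const std::vector<float> &host) {
  std::vector<uint16_t> half(host.size());
  // ... convert float -> float16 into `half` (conversion elided) ...
  rt->MemcpyAsync(device_ptr, half.data(), half.size() * sizeof(uint16_t));
}  // `half` is destroyed here; without the internal snapshot, the enqueued
   // copy could still be pending and would read freed memory.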
@@ -51,6 +51,7 @@ class AscendKernelRuntime : public KernelRuntime {
const std::vector<CNodePtr> &execution_order) override;
void ClearGlobalIdleMem() override;
bool SyncStream() override;
bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override;
void SetContext() override;
void CreateContext() override;
void *context() const override { return rt_context_; }

@@ -49,6 +49,7 @@ class CPUKernelRuntime : public KernelRuntime {
protected:
bool SyncStream() override { return true; };
bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override { return true; };
DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) override;

@@ -18,7 +18,9 @@
#include <vector>
#include <memory>
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "utils/log_adapter.h"
#include "utils/ms_context.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "ir/tensor.h"
#ifdef ENABLE_DEBUGGER
@@ -62,11 +64,22 @@ bool GPUDeviceAddress::SyncHostToDevice(const ShapeVector &, size_t size, TypeId
// NCCL kernel input and output device addresses are aligned, so the host size may not equal the device size
MS_LOG(INFO) << "Sync memory size is inconsistent, host size: " << size << ", device size: " << size_;
}
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
if (execution_mode != kPynativeMode) {
if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(ptr_, host_ptr, size, stream)) {
MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
return false;
}
return GPUDeviceManager::GetInstance().SyncStream(stream);
} else {
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kGPUDevice, device_id);
MS_EXCEPTION_IF_NULL(runtime_instance);
return runtime_instance->MemcpyAsync(ptr_, host_ptr, size, 0);
}
}
void GPUDeviceAddress::ClearDeviceMemory() {

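Note for reviewers: the GPU override of MemcpyAsync (next hunk) ignores the kind argument and always performs a host-to-device copy on the default stream, which is why the PyNative branch above can pass a literal 0.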
@@ -48,7 +48,15 @@ using mindspore::device::memswap::MemSwapInfoSet;
using mindspore::device::memswap::MemSwapManager;
using mindspore::device::memswap::SwapKind;
static const size_t PARAMETER_OUTPUT_INDEX = 0;
bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
bool GPUKernelRuntime::SyncStream() {
if (!GPUDeviceManager::GetInstance().SyncStream(stream_)) {
MS_LOG(ERROR) << "Call SyncStream error.";
return false;
}
FreeAndClearBufferPtrs();
return true;
}
bool GPUKernelRuntime::Init() {
auto context_ptr = MsContext::GetInstance();
@@ -183,6 +191,22 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
}
} // namespace
bool GPUKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) {
std::shared_ptr<char[]> buffer(new char[size]());
MS_EXCEPTION_IF_NULL(buffer);
std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get());
AddBufferPtr(buffer);
auto &stream = GPUDeviceManager::GetInstance().default_stream();
MS_EXCEPTION_IF_NULL(stream);
auto ret = GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(dst, buffer.get(), size, stream);
if (!ret) {
MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
return false;
}
return ret;
}
DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) {
return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);

@@ -52,6 +52,7 @@ class GPUKernelRuntime : public KernelRuntime {
DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) override;
bool SyncStream() override;
bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override;
private:
GPUKernelRuntime(const GPUKernelRuntime &);

@@ -75,6 +75,7 @@ class KernelRuntime {
const std::unordered_set<ValueNodePtr> &value_nodes,
const std::vector<CNodePtr> &execution_order);
virtual bool SyncStream() = 0;
virtual bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) = 0;
virtual void ClearGlobalIdleMem() {}
virtual void CreateContext() {}
virtual void SetContext() {}
@@ -101,6 +102,8 @@ class KernelRuntime {
virtual void PreInit() {}
virtual uint64_t GetAvailableMemMaxSize() const { return 0; }
void AddBufferPtr(std::shared_ptr<char[]> ptr) { buffer_ptrs_.push_back(ptr); }
void FreeAndClearBufferPtrs() { buffer_ptrs_.clear(); }
protected:
virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
@@ -149,6 +152,7 @@ class KernelRuntime {
void *stream_ = nullptr;
std::shared_ptr<MemoryManager> mem_manager_{nullptr};
std::map<uint32_t, std::vector<DynamicKernelPtr>> graph_dynamic_kernel_map_;
std::vector<std::shared_ptr<char[]>> buffer_ptrs_ = {};
};
using KernelRuntimePtr = std::shared_ptr<KernelRuntime>;
} // namespace device

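End to end, the new bookkeeping behaves like the following usage of the hypothetical Runtime sketch from the top of this page:

// Several async copies may be issued, then one synchronization point.
int main() {
  Runtime rt;
  std::vector<char> device(64);      // stands in for device memory
  std::vector<char> host(64, 'x');   // short-lived host data
  rt.MemcpyAsync(device.data(), host.data(), host.size());
  host.assign(host.size(), 'y');     // safe: the runtime holds a snapshot
  rt.SyncStream();                   // copies complete, snapshots freed
  return 0;
}

One caveat worth flagging in review: snapshots accumulate until the next SyncStream, so a long run of PyNative copies temporarily doubles the host memory held for in-flight data.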