refine GPU memory swap performance

pull/3268/head
lizhenyu 5 years ago
parent 28c8a5cc26
commit c67e562373

@ -46,7 +46,7 @@ struct KernelExecutionInfo {
size_t swap_in_task_num_{0};
// Key: output index, value: topo orders of node users
std::map<size_t, std::vector<size_t>> node_users_map_;
// Key: output idx, value: (host addr, dirty or not)
// Key: output index, value: pair (host addr, dirty or not)
std::map<size_t, std::pair<HostAddress, bool>> host_addrs_;
KernelExecutionInfo() {}
@ -105,7 +105,12 @@ class MemCopyManager {
virtual void AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) {}
virtual void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) {}
virtual void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr, bool profiling,
float *cost_time) {}
virtual void AddMemSwapOutTaskMock(const DeviceAddressPtr &device_address) {}
virtual void AddMemSwapInTaskMock(const DeviceAddressPtr &device_address) {}
virtual bool SyncMemCopyStream(SwapKind swap_kind) { return true; }
@ -113,11 +118,17 @@ class MemCopyManager {
virtual DeviceAddressPtr UpdateSwapInQueue() { return nullptr; }
virtual DeviceAddressPtr UpdateSwapOutQueueMock() { return nullptr; }
virtual DeviceAddressPtr UpdateSwapInQueueMock() { return nullptr; }
virtual bool AllocHostPinnedMem(size_t size, void **addr) const { return true; }
virtual void FreeHostPinnedMem(void *addr) const {}
virtual void ClearSwapQueue() {}
virtual void ClearSwapQueueMock() {}
};
using MemCopyManagerPtr = std::shared_ptr<MemCopyManager>;
using MemSwapInfoSet = std::set<MemSwapInfo, SwapInfoComp>;

@ -48,12 +48,12 @@ class MemSwapManager {
bool Init(const mindspore::session::KernelGraph *kernel_graph, size_t swap_mem_size = 0);
void AddMemSwapTask(SwapKind swap_kind, const DeviceAddressPtr &device_address,
const HostAddress &host_address) const;
void AddMemSwapTask(SwapKind swap_kind, const DeviceAddressPtr &device_address, const HostAddress &host_address,
bool mock, bool profiling = false, float *cost_time = nullptr) const;
bool SyncMemCopyStream(SwapKind swap_kind) const;
DeviceAddressPtr UpdateSwapQueue(SwapKind swap_kind) const;
DeviceAddressPtr UpdateSwapQueue(SwapKind swap_kind, bool mock) const;
bool RetreatSwapInfo();
@ -63,8 +63,6 @@ class MemSwapManager {
bool mem_swap_init() const { return mem_swap_initialized_; }
KernelExecutionInfo &SearchKernelExecutionInfo(const AnfNodePtr &kernel) const;
void AddKernelExecutionPerform(const AnfNodePtr &kernel, float perform);
float QueryKernelExecutionPerform(const AnfNodePtr &kernel) const;
@ -79,7 +77,9 @@ class MemSwapManager {
size_t QueryKernelTriggerSwapInTaskNum(const AnfNodePtr &kernel) const;
const AnfNodePtr QueryKerneByTopoOrder(size_t index) const;
const AnfNodePtr QueryKernelByTopoOrder(size_t index) const;
size_t QueryKernelTopoOrder(const AnfNodePtr &kernel) const;
const MemSwapInfoSet &QueryKernelMemSwapInfo(const AnfNodePtr &kernel) const;
@ -93,17 +93,19 @@ class MemSwapManager {
void ResetHostAddrIsDirty();
void InsertSwapInBlackList(const void *device_ptr);
bool FindInSwapInBlackList(const void *device_ptr) const;
bool AllocHostPinnedMem(size_t size, void **addr) const;
void ReleaseHostPinnedMem();
void ClearSwapQueue() const;
void ClearSwapQueue(bool mock) const;
void DumpSwapInfo() const;
void DumpUserNodes() const;
private:
KernelExecutionInfo &SearchKernelExecutionInfo(const AnfNodePtr &kernel) const;
void AddSwapInfo();
void ResetSwapInfo();
@ -130,6 +132,8 @@ class MemSwapManager {
bool CheckDistanceBetweenKernels(const TensorInfo &tensor_info) const;
std::vector<std::pair<size_t, size_t>> CheckDistanceBetweenKernelsWithIdx(const TensorInfo &tensor_info) const;
bool IsCommunicationRelevantOp(const AnfNodePtr &kernel) const;
std::vector<CNodePtr> execution_order_;
@ -139,7 +143,6 @@ class MemSwapManager {
// Key: trigger swap kernel, value: MemSwapInfoSet of kernel need to be swapped
std::unordered_map<void *, MemSwapInfoSet> mem_swap_info_map_;
std::vector<HostAddress> host_addrs_list_;
std::unordered_set<const void *> swap_in_blacklist_;
// Key: cache kernel address, value: lists of first time move pos or not
std::map<void *, std::vector<bool>> kernel_first_move_cache_map_;

File diff suppressed because it is too large Load Diff

@ -20,6 +20,7 @@
#include <string>
#include <memory>
#include <vector>
#include <set>
#include <utility>
#include <unordered_map>
#include "runtime/device/kernel_runtime.h"
@ -55,23 +56,27 @@ class GPUKernelRuntime : public KernelRuntime {
void InitKernelOutputAddress(const session::KernelGraph *graph);
void InitKernelWorkspaceAddress(const session::KernelGraph *graph);
void InitMemorySwapInfo(const session::KernelGraph *graph);
void SaveGraphOutputNode(const session::KernelGraph *graph);
bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const;
void ClearKernelOutputAddress(const session::KernelGraph *graph);
void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
bool RunOneStep(const session::KernelGraph *graph);
bool SearchMemSwapScheme(const session::KernelGraph *graph);
bool RefineMemSwapScheme(const session::KernelGraph *graph);
bool LaunchKernelDynamic(const session::KernelGraph *graph, bool mock = false, bool profiling = false);
void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
const AddressPtrList &workspace, const AddressPtrList &outputs);
bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size);
bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces,
AddressPtrList *kernel_outputs);
bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs);
AddressPtrList *kernel_outputs, bool mock);
bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock);
bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
AddressPtrList *kernel_outputs);
AddressPtrList *kernel_outputs, bool mock);
bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces);
const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces,
bool mock);
void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph);
void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel);
void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel);
@ -79,15 +84,16 @@ class GPUKernelRuntime : public KernelRuntime {
const DeviceAddressPtrList addr_list, size_t total_size,
std::vector<size_t> size_list);
void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel);
bool AddMemorySwapTask(const AnfNodePtr &kernel);
bool UpdateMemorySwapInfo(const session::KernelGraph *graph);
bool UpdateMemorySwapTask(const AnfNodePtr &kernel);
void UpdateHostSwapQueue(const DeviceAddressPtr device_address);
void UpdateDeviceSwapQueue();
void ClearSwapQueue();
bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling);
void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock);
void UpdateHostSwapOutQueue(bool mock);
void ClearSwapInfo(bool mock);
std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
std::unordered_map<uint32_t, MemSwapManagerPtr> mem_swap_map_;
std::unordered_map<uint32_t, bool> is_first_step_map_;
std::unordered_map<uint32_t, std::set<AnfNodePtr>> graph_output_map_;
MemReuseUtilPtr mem_reuse_util_{nullptr};
MemSwapManagerPtr mem_swap_manager_{nullptr};
};

@ -47,11 +47,20 @@ void GPUMemCopyManager::AddMemSwapOutTask(const DeviceAddressPtr &device_address
swap_out_queue_.emplace(device_address, event);
}
void GPUMemCopyManager::AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) {
void GPUMemCopyManager::AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr,
bool profiling, float *cost_time) {
MS_EXCEPTION_IF_NULL(device_address);
MS_EXCEPTION_IF_NULL(host_addr.addr);
DeviceEvent event = nullptr;
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&event, cudaEventDisableTiming), "Failed to create CUDA event.");
DeviceEvent start = nullptr;
DeviceEvent end = nullptr;
if (profiling) {
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&start), "Failed to create CUDA event.");
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end), "Failed to create CUDA event.");
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(start, swap_in_stream_),
"Failed to record CUDA event to swap in stream.");
} else {
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end, cudaEventDisableTiming), "Failed to create CUDA event.");
}
DeviceMemPtr device_ptr = const_cast<DeviceMemPtr>(device_address->GetPtr());
MS_EXCEPTION_IF_NULL(device_ptr);
device_address->set_status(DeviceAddressStatus::kInHostToDevice);
@ -59,9 +68,27 @@ void GPUMemCopyManager::AddMemSwapInTask(const DeviceAddressPtr &device_address,
CHECK_OP_RET_WITH_EXCEPT(
CudaDriver::CopyHostMemToDeviceAsync(device_ptr, host_addr.addr, host_addr.size, swap_in_stream_),
"Failed to copy host memory to device.");
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(event, swap_in_stream_),
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(end, swap_in_stream_),
"Failed to record CUDA event to swap in stream.");
swap_in_queue_.emplace(device_address, event);
if (profiling) {
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(start), "Failed to sync event.");
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(end), "Failed to sync event.");
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::ElapsedTime(cost_time, start, end), "Failed to record elapsed time.");
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(start), "Failed to destroy event.");
}
swap_in_queue_.emplace(device_address, end);
}
void GPUMemCopyManager::AddMemSwapOutTaskMock(const DeviceAddressPtr &device_address) {
  // Mock variant of AddMemSwapOutTask: records the device-to-host transition and
  // enqueues the address without issuing any real CUDA copy or event.
  MS_EXCEPTION_IF_NULL(device_address);
  device_address->set_status(DeviceAddressStatus::kInDeviceToHost);
  swap_out_queue_mock_.push(device_address);
}
void GPUMemCopyManager::AddMemSwapInTaskMock(const DeviceAddressPtr &device_address) {
  // Mock variant of AddMemSwapInTask: records the host-to-device transition and
  // enqueues the address without issuing any real CUDA copy or event.
  MS_EXCEPTION_IF_NULL(device_address);
  device_address->set_status(DeviceAddressStatus::kInHostToDevice);
  swap_in_queue_mock_.push(device_address);
}
bool GPUMemCopyManager::SyncMemCopyStream(SwapKind swap_kind) {
@ -104,6 +131,24 @@ DeviceAddressPtr GPUMemCopyManager::UpdateSwapInQueue() {
return device_address;
}
DeviceAddressPtr GPUMemCopyManager::UpdateSwapOutQueueMock() {
  // Pop and return the oldest pending mock swap-out address; nullptr when the
  // mock queue has nothing pending.
  DeviceAddressPtr head = nullptr;
  if (!swap_out_queue_mock_.empty()) {
    head = swap_out_queue_mock_.front();
    swap_out_queue_mock_.pop();
  }
  return head;
}
DeviceAddressPtr GPUMemCopyManager::UpdateSwapInQueueMock() {
  // Pop and return the oldest pending mock swap-in address; nullptr when the
  // mock queue has nothing pending.
  DeviceAddressPtr head = nullptr;
  if (!swap_in_queue_mock_.empty()) {
    head = swap_in_queue_mock_.front();
    swap_in_queue_mock_.pop();
  }
  return head;
}
bool GPUMemCopyManager::AllocHostPinnedMem(size_t size, void **addr) const {
auto alloc_size = CudaDriver::AllocHostPinnedMem(size, addr);
return alloc_size == size;
@ -126,6 +171,15 @@ void GPUMemCopyManager::ClearSwapQueue() {
swap_in_queue_.pop();
}
}
void GPUMemCopyManager::ClearSwapQueueMock() {
  // Discard every pending entry in both mock swap queues by swapping each with
  // a freshly constructed empty queue (equivalent to popping until empty).
  decltype(swap_out_queue_mock_)().swap(swap_out_queue_mock_);
  decltype(swap_in_queue_mock_)().swap(swap_in_queue_mock_);
}
} // namespace gpu
} // namespace device
} // namespace mindspore

@ -40,7 +40,12 @@ class GPUMemCopyManager : public MemCopyManager {
void AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) override;
void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) override;
void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr, bool profiling,
float *cost_time) override;
void AddMemSwapOutTaskMock(const DeviceAddressPtr &device_address) override;
void AddMemSwapInTaskMock(const DeviceAddressPtr &device_address) override;
bool SyncMemCopyStream(SwapKind swap_kind) override;
@ -48,17 +53,25 @@ class GPUMemCopyManager : public MemCopyManager {
DeviceAddressPtr UpdateSwapInQueue() override;
DeviceAddressPtr UpdateSwapOutQueueMock() override;
DeviceAddressPtr UpdateSwapInQueueMock() override;
bool AllocHostPinnedMem(size_t size, void **addr) const override;
void FreeHostPinnedMem(void *addr) const override;
void ClearSwapQueue() override;
void ClearSwapQueueMock() override;
private:
DeviceStream swap_out_stream_{nullptr};
DeviceStream swap_in_stream_{nullptr};
std::queue<std::pair<DeviceAddressPtr, DeviceEvent>> swap_out_queue_;
std::queue<std::pair<DeviceAddressPtr, DeviceEvent>> swap_in_queue_;
std::queue<DeviceAddressPtr> swap_out_queue_mock_;
std::queue<DeviceAddressPtr> swap_in_queue_mock_;
};
using GPUMemCopyManagerPtr = std::shared_ptr<GPUMemCopyManager>;
} // namespace gpu

@ -355,7 +355,7 @@ def test_trainTensor(num_classes=10, epoch=8, batch_size=1):
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_trainTensor_big_batchSize(num_classes=10, epoch=8, batch_size=170):
def test_trainTensor_big_batchSize(num_classes=10, epoch=8, batch_size=338):
net = resnet50(num_classes)
lr = 0.1
momentum = 0.9

Loading…
Cancel
Save