!3246 refine gpu memory swap performance

Merge pull request !3246 from zyli2020/refine_gpu_mem_swap
pull/3246/MERGE
mindspore-ci-bot authored 5 years ago, committed by Gitee
commit cfafdcbcf0

@@ -19,6 +19,7 @@
 #include <vector>
 #include <map>
+#include <set>
 #include <queue>
 #include <memory>
 #include <utility>
@@ -40,29 +41,58 @@ struct TensorInfo {
 struct KernelExecutionInfo {
   size_t topo_order_{0};
   float execution_perform_{0.0};
-  bool trigger_swap_{false};
-  bool need_swap_{false};
-  // output index to topo orders of node users
+  bool trigger_swap_out_{false};
+  bool trigger_swap_in_{false};
+  size_t swap_in_task_num_{0};
+  // Key: output index, value: topo orders of node users
   std::map<size_t, std::vector<size_t>> node_users_map_;
-  // kernel output idx to host addr
-  std::map<size_t, HostAddress> host_addrs_;
+  // Key: output idx, value: (host addr, dirty or not)
+  std::map<size_t, std::pair<HostAddress, bool>> host_addrs_;
-  KernelExecutionInfo() : KernelExecutionInfo(0, 0.0, false, false) {}
-  explicit KernelExecutionInfo(size_t topo_order)
-      : topo_order_(topo_order), execution_perform_(0.0), trigger_swap_(false), need_swap_(false) {}
-  KernelExecutionInfo(size_t topo_order, float execution_perform, bool trigger_swap, bool need_swap)
+  KernelExecutionInfo() {}
+  explicit KernelExecutionInfo(size_t topo_order) : KernelExecutionInfo(topo_order, 0.0, false, false, 0) {}
+  KernelExecutionInfo(size_t topo_order, float execution_perform, bool trigger_swap_out, bool trigger_swap_in,
+                      size_t swap_in_task_num)
       : topo_order_(topo_order),
         execution_perform_(execution_perform),
-        trigger_swap_(trigger_swap),
-        need_swap_(need_swap) {}
+        trigger_swap_out_(trigger_swap_out),
+        trigger_swap_in_(trigger_swap_in),
+        swap_in_task_num_(swap_in_task_num) {}
 };
-// trigger swap
 struct MemSwapInfo {
   SwapKind swap_kind_;
-  // kernel need to be swapped
-  AnfNodePtr kernel_{nullptr};
+  // Topo order of the kernel to be swapped
+  size_t topo_order_;
   size_t output_idx_{0};
+  // Records the swap-out position of a swapped-in tensor
+  size_t swap_out_pos_;
 };
+struct SwapInfoComp {
+  bool operator()(const MemSwapInfo &a, const MemSwapInfo &b) {
+    int swap_kind_a = static_cast<int>(a.swap_kind_);
+    int swap_kind_b = static_cast<int>(b.swap_kind_);
+    if (swap_kind_a < swap_kind_b) {
+      return true;
+    } else if (swap_kind_a > swap_kind_b) {
+      return false;
+    }
+    if (a.swap_out_pos_ < b.swap_out_pos_) {
+      return true;
+    } else if (a.swap_out_pos_ > b.swap_out_pos_) {
+      return false;
+    }
+    if (a.topo_order_ < b.topo_order_) {
+      return true;
+    } else if (a.topo_order_ > b.topo_order_) {
+      return false;
+    }
+    return a.output_idx_ < b.output_idx_;
+  }
+};
 class MemCopyManager {
@@ -90,6 +120,7 @@ class MemCopyManager {
   virtual void ClearSwapQueue() {}
 };
 using MemCopyManagerPtr = std::shared_ptr<MemCopyManager>;
+using MemSwapInfoSet = std::set<MemSwapInfo, SwapInfoComp>;
 }  // namespace memswap
 }  // namespace device
 }  // namespace mindspore
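
Note: the comparator gives MemSwapInfoSet a strict weak ordering: swap kind first, then swap-out position, then topo order, then output index, so entries of the same kind sort by where their tensor was swapped out and ties resolve deterministically. A minimal self-contained C++ sketch of that ordering (only the comparator logic is taken from the diff; the SwapKind enum values and the sample data are assumptions for illustration):

#include <cstddef>
#include <iostream>
#include <set>

enum class SwapKind { kDeviceToHost = 0, kHostToDevice = 1 };  // assumed values

struct MemSwapInfo {
  SwapKind swap_kind_;
  size_t topo_order_;
  size_t output_idx_{0};
  size_t swap_out_pos_;
};

// Same ordering as SwapInfoComp above, condensed with early returns.
struct SwapInfoComp {
  bool operator()(const MemSwapInfo &a, const MemSwapInfo &b) const {
    if (a.swap_kind_ != b.swap_kind_) return a.swap_kind_ < b.swap_kind_;
    if (a.swap_out_pos_ != b.swap_out_pos_) return a.swap_out_pos_ < b.swap_out_pos_;
    if (a.topo_order_ != b.topo_order_) return a.topo_order_ < b.topo_order_;
    return a.output_idx_ < b.output_idx_;
  }
};

int main() {
  std::set<MemSwapInfo, SwapInfoComp> infos;
  infos.insert({SwapKind::kHostToDevice, /*topo*/ 9, /*idx*/ 0, /*out_pos*/ 4});
  infos.insert({SwapKind::kDeviceToHost, /*topo*/ 3, /*idx*/ 1, /*out_pos*/ 0});
  infos.insert({SwapKind::kHostToDevice, /*topo*/ 7, /*idx*/ 0, /*out_pos*/ 2});
  // Prints the device-to-host entry first, then the swap-ins by swap_out_pos_.
  for (const auto &info : infos) {
    std::cout << static_cast<int>(info.swap_kind_) << " out_pos=" << info.swap_out_pos_
              << " topo=" << info.topo_order_ << " idx=" << info.output_idx_ << "\n";
  }
  return 0;
}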

@@ -32,7 +32,11 @@ namespace memswap {
 class MemSwapManager {
  public:
   explicit MemSwapManager(const MemCopyManagerPtr &mem_copy_manager)
-      : tensor_size_threshold_(0), tensor_size_threshold_idx_(0), tensor_size_num_(1), distance_threshold_(1) {
+      : tensor_size_threshold_(0),
+        tensor_size_threshold_idx_(0),
+        tensor_size_num_(1),
+        distance_threshold_(1),
+        distance_decay_step_(1) {
     mem_copy_manager_ = mem_copy_manager;
   }
@@ -42,7 +46,7 @@ class MemSwapManager {
   ~MemSwapManager() = default;
-  void Init(const mindspore::session::KernelGraph *kernel_graph);
+  bool Init(const mindspore::session::KernelGraph *kernel_graph, size_t swap_mem_size = 0);
   void AddMemSwapTask(SwapKind swap_kind, const DeviceAddressPtr &device_address,
                       const HostAddress &host_address) const;
@@ -51,9 +55,10 @@ class MemSwapManager {
   DeviceAddressPtr UpdateSwapQueue(SwapKind swap_kind) const;
+  // Retreat to find a workable swap scheme
+  bool RetreatSwapInfo();
+  void AdjustSwapInPos(const AnfNodePtr &kernel, size_t index);
   bool trigger_swap() const { return trigger_swap_; }
   bool mem_swap_init() const { return mem_swap_initialized_; }
@@ -70,16 +75,28 @@ class MemSwapManager {
   bool QueryKernelTriggerSwap(const AnfNodePtr &kernel) const;
-  bool QueryKernelNeedSwap(const AnfNodePtr &kernel) const;
+  bool QueryKernelTriggerSwapIn(const AnfNodePtr &kernel) const;
+  size_t QueryKernelTriggerSwapInTaskNum(const AnfNodePtr &kernel) const;
+  const AnfNodePtr QueryKerneByTopoOrder(size_t index) const;
-  const std::vector<MemSwapInfo> &QueryKernelMemSwapInfo(const AnfNodePtr &kernel) const;
+  const MemSwapInfoSet &QueryKernelMemSwapInfo(const AnfNodePtr &kernel) const;
+  void AssignHostMemory();
+  const HostAddress &QueryKernelHostAddr(const AnfNodePtr &kernel, size_t output_idx) const;
+  void AddKernelHostAddrIsDirty(const AnfNodePtr &kernel, size_t output_idx, bool dirty);
+  bool QueryKernelHostAddrIsDirty(const AnfNodePtr &kernel, size_t output_idx) const;
+  void ResetHostAddrIsDirty();
   void InsertSwapInBlackList(const void *device_ptr);
   bool FindInSwapInBlackList(const void *device_ptr) const;
-  const HostAddress &kernel_host_addr(const AnfNodePtr &kernel, size_t output_idx) const;
   bool AllocHostPinnedMem(size_t size, void **addr) const;
   void ReleaseHostPinnedMem();
@@ -93,27 +110,47 @@ class MemSwapManager {
   void SaveUserKernelTopoOrder();
-  void AddKernelTriggerSwap(const AnfNodePtr &kernel, bool trigger_swap);
-  void AddKernelNeedSwap(const AnfNodePtr &kernel, bool need_swap);
+  bool InitSwapThreshold(size_t swap_mem_size);
+  void RetreatSwapThreshold();
+  void CacheCurSwapInfoSet(const AnfNodePtr &kernel);
+  void AddFirstTimeMovePos(const AnfNodePtr &kernel, size_t index, bool first_time);
+  bool QueryFirstTimeMovePos(const AnfNodePtr &kernel, size_t index) const;
+  size_t BestSwapInPerformPos(const AnfNodePtr &trigger_kernel, const MemSwapInfo &mem_swap_info) const;
+  void MoveSwapInfoPos(size_t des_pos, size_t src_pos, const MemSwapInfo &mem_swap_info);
   void AddKernelMemSwapInfo(const AnfNodePtr &kernel, const MemSwapInfo &mem_swap_info);
+  void RemoveKernelMemSwapInfo(const AnfNodePtr &kernel, const MemSwapInfo &mem_swap_info);
   bool CheckDistanceBetweenKernels(const TensorInfo &tensor_info) const;
   bool IsCommunicationRelevantOp(const AnfNodePtr &kernel) const;
   std::vector<CNodePtr> execution_order_;
   std::vector<TensorInfo> ordered_tensors_;
   std::unordered_map<void *, KernelExecutionInfo> kernel_execution_info_;
   std::unordered_map<void *, std::map<size_t, PerformPair>> kernel_swap_perform_;
-  // trigger swap kernel key : MemSwapInfo of kernel need to be swapped
-  std::unordered_map<void *, std::vector<MemSwapInfo>> mem_swap_info_;
+  // Key: trigger-swap kernel, value: MemSwapInfoSet of kernels to be swapped
+  std::unordered_map<void *, MemSwapInfoSet> mem_swap_info_map_;
   std::vector<HostAddress> host_addrs_list_;
   std::unordered_set<const void *> swap_in_blacklist_;
+  // Key: cached kernel address, value: per-position flags for first-time moves
+  std::map<void *, std::vector<bool>> kernel_first_move_cache_map_;
+  std::vector<MemSwapInfo> mem_swap_info_cache_list_;
+  std::pair<size_t, size_t> best_and_cur_pos_cache_;
   size_t tensor_size_threshold_;
   size_t tensor_size_threshold_idx_;
   size_t tensor_size_num_;
   size_t distance_threshold_;
+  size_t distance_decay_step_;
   MemCopyManagerPtr mem_copy_manager_{nullptr};
   FuncGraphManagerPtr graph_manager_{nullptr};
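
Note: Init changed from void to bool and now takes a swap-memory budget, so callers can detect whether a workable initial swap scheme exists. A hedged caller sketch in C++ (SetUpMemSwap and its arguments are illustrative names, not runtime code; error handling is assumed):

bool SetUpMemSwap(const MemCopyManagerPtr &mem_copy_manager,
                  const session::KernelGraph *kernel_graph, size_t swap_mem_size) {
  auto swap_manager = std::make_shared<MemSwapManager>(mem_copy_manager);
  if (!swap_manager->Init(kernel_graph, swap_mem_size)) {
    return false;  // no workable initial scheme under this budget; run without swapping
  }
  // trigger_swap() reports whether swapping is actually in effect for the graph;
  // RetreatSwapInfo() can later relax the scheme if a mock run still goes OOM.
  return swap_manager->mem_swap_init();
}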

@@ -707,6 +707,18 @@ DeviceAddress *AnfRuntimeAlgorithm::GetWorkspaceAddr(const AnfNodePtr &node, siz
   return addr;
 }
+// get workspace device mutable addr of anf_node
+DeviceAddressPtr AnfRuntimeAlgorithm::GetMutableWorkspaceAddr(const AnfNodePtr &node, size_t index) {
+  MS_EXCEPTION_IF_NULL(node);
+  auto kernel_info = dynamic_cast<device::KernelInfo *>(node->kernel_info());
+  MS_EXCEPTION_IF_NULL(kernel_info);
+  auto addr = kernel_info->GetMutableWorkspaceAddr(index);
+  if (addr == nullptr) {
+    MS_LOG(EXCEPTION) << "Index " << index << " of node [" << node->DebugString() << "] workspace addr does not exist";
+  }
+  return addr;
+}
 // set infer shapes and types of anf node
 void AnfRuntimeAlgorithm::SetOutputInferTypeAndShape(const std::vector<TypeId> &types,
                                                      const std::vector<std::vector<size_t>> &shapes, AnfNode *node) {
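
Note: GetWorkspaceAddr returns a raw pointer for launching kernels; the new GetMutableWorkspaceAddr returns the owning DeviceAddressPtr, which lets the runtime release and re-allocate workspace device memory between steps (see ClearKernelWorkspaceAddress in the runtime header below). A hedged C++ usage sketch (workspace_num, kernel, and mem_manager come from the surrounding runtime; FreeMemFromMemPool is assumed for illustration):

for (size_t i = 0; i < workspace_num; ++i) {
  DeviceAddressPtr ws = AnfRuntimeAlgorithm::GetMutableWorkspaceAddr(kernel, i);
  MS_EXCEPTION_IF_NULL(ws);
  mem_manager->FreeMemFromMemPool(ws);  // hypothetical pool release: the shared_ptr keeps
                                        // the DeviceAddress object alive while its device
                                        // memory is returned to the pool
}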

@@ -149,6 +149,8 @@ class AnfRuntimeAlgorithm {
   static void SetWorkspaceAddr(const DeviceAddressPtr &addr, size_t output_idx, AnfNode *node);
   // get workspace device addr of anf_node
   static DeviceAddress *GetWorkspaceAddr(const AnfNodePtr &node, size_t output_idx);
+  // get workspace device mutable addr of anf_node
+  static DeviceAddressPtr GetMutableWorkspaceAddr(const AnfNodePtr &node, size_t index);
   // set infer shapes and types of anf node
   static void SetOutputInferTypeAndShape(const std::vector<TypeId> &types,
                                          const std::vector<std::vector<size_t>> &shapes, AnfNode *node);

@@ -209,6 +209,16 @@ bool CudaDriver::QueryEvent(const DeviceEvent &event) {
   }
 }
+bool CudaDriver::ElapsedTime(float *cost_time, const DeviceEvent &start, const DeviceEvent &end) {
+  auto ret = cudaEventElapsedTime(cost_time, (cudaEvent_t)start, (cudaEvent_t)end);
+  if (ret == cudaSuccess) {
+    return true;
+  } else {
+    MS_LOG(ERROR) << "cudaEventElapsedTime failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
+    return false;
+  }
+}
 int CudaDriver::device_count() {
   int dev_count;
   auto ret = cudaGetDeviceCount(&dev_count);

@@ -57,6 +57,7 @@ class CudaDriver {
   static bool RecordEvent(DeviceEvent event, DeviceStream stream = 0);
   static bool SyncEvent(const DeviceEvent &event);
   static bool QueryEvent(const DeviceEvent &event);
+  static bool ElapsedTime(float *cost_time, const DeviceEvent &start, const DeviceEvent &end);
   // Encapsulate the cuda APIs associated with device management.
   static int device_count();
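
Note: ElapsedTime wraps cudaEventElapsedTime and completes the event API (RecordEvent, SyncEvent, QueryEvent) so the runtime can time individual kernels for the profiling pass. The wrapper follows the standard CUDA event-timing pattern; a plain CUDA runtime sketch of the same measurement, independent of the CudaDriver types (the launch callback is an assumption for illustration):

#include <cuda_runtime.h>

float TimeKernelMs(void (*launch)(cudaStream_t), cudaStream_t stream) {
  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);
  cudaEventRecord(start, stream);         // mark the point before the work
  launch(stream);                         // enqueue the kernel being measured
  cudaEventRecord(end, stream);           // mark the point after the work
  cudaEventSynchronize(end);              // wait until 'end' has been reached
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, end);  // GPU-side elapsed time in milliseconds
  cudaEventDestroy(start);
  cudaEventDestroy(end);
  return ms;
}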

(File diff suppressed because it is too large.)

@@ -53,11 +53,17 @@ class GPUKernelRuntime : public KernelRuntime {
   // The related functions and members for using dynamic memory pool.
   void InitKernelRefCount(const session::KernelGraph *graph);
   void InitKernelOutputAddress(const session::KernelGraph *graph);
+  void InitKernelWorkspaceAddress(const session::KernelGraph *graph);
+  void InitMemorySwapInfo(const session::KernelGraph *graph);
   void ClearKernelOutputAddress(const session::KernelGraph *graph);
-  bool LaunchKernelDynamic(const session::KernelGraph *graph);
+  void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
+  void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
+  bool SearchMemSwapScheme(const session::KernelGraph *graph);
+  bool RefineMemSwapScheme(const session::KernelGraph *graph);
+  bool LaunchKernelDynamic(const session::KernelGraph *graph, bool mock = false, bool profiling = false);
+  void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
+                                     const AddressPtrList &workspace, const AddressPtrList &outputs);
-  bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size);
+  void *AttemptMallocMem(size_t size);
   bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                              AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces,
                              AddressPtrList *kernel_outputs);
@@ -72,7 +78,7 @@ class GPUKernelRuntime : public KernelRuntime {
   void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
                                   const DeviceAddressPtrList addr_list, size_t total_size,
                                   std::vector<size_t> size_list);
-  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, const AddressPtrList &kernel_workspaces);
+  void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel);
   bool AddMemorySwapTask(const AnfNodePtr &kernel);
   bool UpdateMemorySwapInfo(const session::KernelGraph *graph);
   bool UpdateMemorySwapTask(const AnfNodePtr &kernel);
@@ -81,6 +87,7 @@ class GPUKernelRuntime : public KernelRuntime {
   void ClearSwapQueue();
   std::unordered_map<uint32_t, MemReuseUtilPtr> mem_reuse_util_map_;
   std::unordered_map<uint32_t, MemSwapManagerPtr> mem_swap_map_;
+  std::unordered_map<uint32_t, bool> is_first_step_map_;
   MemReuseUtilPtr mem_reuse_util_{nullptr};
   MemSwapManagerPtr mem_swap_manager_{nullptr};
 };
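
Note: the new signatures suggest a two-pass strategy: LaunchKernelDynamic(graph, mock=true) walks the graph allocating and freeing memory without running kernels, to test whether the current swap scheme fits in device memory, while the profiling variant times kernels via LaunchKernelWithTimeProfiling so RefineMemSwapScheme can move swap-in positions (AdjustSwapInPos). The .cc diff is suppressed above, so the loop below is an assumed reading in C++, with the private methods replaced by hypothetical callables:

#include <functional>

// Hypothetical stand-ins for the private methods declared in this header.
using MockLaunch = std::function<bool()>;     // e.g. LaunchKernelDynamic(graph, /*mock=*/true)
using RetreatScheme = std::function<bool()>;  // e.g. MemSwapManager::RetreatSwapInfo()

// Assumed search: retreat to progressively more aggressive swap schemes until
// a mock (allocation-only) pass over the whole graph succeeds.
bool SearchSwapScheme(const MockLaunch &mock_launch, const RetreatScheme &retreat) {
  while (!mock_launch()) {
    if (!retreat()) {
      return false;  // no scheme fits under the current memory budget
    }
  }
  return true;
}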

@@ -73,6 +73,14 @@ DeviceAddress *KernelInfo::GetWorkspaceAddr(size_t index) const {
   return workspace_address_list_[index].get();
 }
+DeviceAddressPtr KernelInfo::GetMutableWorkspaceAddr(size_t index) const {
+  if (index >= workspace_address_list_.size()) {
+    MS_LOG(ERROR) << "Index [" << index << "] out of range";
+    return nullptr;
+  }
+  return workspace_address_list_[index];
+}
 bool KernelInfo::SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t index) {
   if (workspace_address_list_.empty()) {
     // parameter and valuenode

@@ -54,6 +54,7 @@ class KernelInfo : public KernelInfoDevice {
   bool OutputAddrExist(size_t index) const;
   bool SetOutputAddr(const DeviceAddressPtr &output_address, size_t index);
   DeviceAddress *GetWorkspaceAddr(size_t index) const;
+  DeviceAddressPtr GetMutableWorkspaceAddr(size_t index) const;
   bool SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t index);
   void set_kernel_mod(const kernel::KernelModPtr &kernel_mod);
   kernel::KernelMod *MutableKernelMod() const;
