diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc
index 3a1a53c600..fceb5e78b4 100644
--- a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc
+++ b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.cc
@@ -14,21 +14,29 @@
  * limitations under the License.
  */
 
+#include <algorithm>
 #include "device/gpu/gpu_memory_allocator.h"
 #include "device/gpu/cuda_driver.h"
 #include "utils/log_adapter.h"
+#include "utils/context/ms_context.h"
+#include "utils/convert_utils_base.h"
 
 namespace mindspore {
 namespace device {
 namespace gpu {
 bool GPUMemoryAllocator::Init() {
   size_t total_size = total_mem_size();
-  size_t free_size = free_mem_size();
-  if (total_size > 0 && free_size > 0) {
-    MS_LOG(INFO) << "GPU device total memory size " << total_size << ", current free memory size " << free_size;
+  size_t free_size = CudaDriver::free_mem_size();
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  float max_device_memory = context_ptr->max_device_memory();
+  max_available_device_memory_ = FloatToSize(max_device_memory * 1024 * 1024 * 1024);
+  if (total_size > 0 && free_size > 0 && max_available_device_memory_ > 0) {
+    MS_LOG(INFO) << "GPU device total memory size " << total_size << ", current free memory size " << free_size
+                 << ", set max available memory size " << max_available_device_memory_;
   } else {
     MS_LOG(EXCEPTION) << "GPU device memory error, total memory size " << total_size << ", current free memory size "
-                      << free_size;
+                      << free_size << ", set max available memory size " << max_available_device_memory_;
   }
   return true;
 }
@@ -64,13 +72,18 @@ size_t GPUMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
   if (alloc_size == 0) {
     MS_LOG(EXCEPTION) << "Alloc device memory[" << size << "] failed.";
   }
-  MS_LOG(INFO) << "Current free memory size[" << free_size << "], current alloc size[" << alloc_size << "].";
+  total_used_device_memory_ += alloc_size;
+  max_available_device_memory_ -= alloc_size;
+  MS_LOG(INFO) << "Current free memory size[" << free_size - alloc_size << "], current alloc size[" << alloc_size
+               << "], total used size[" << total_used_device_memory_ << "].";
   return alloc_size;
 }
 
 bool GPUMemoryAllocator::FreeDeviceMem(const DeviceMemPtr &addr) { return CudaDriver::FreeDeviceMem(addr); }
 
-size_t GPUMemoryAllocator::free_mem_size() { return CudaDriver::free_mem_size(); }
+size_t GPUMemoryAllocator::free_mem_size() {
+  return std::min(CudaDriver::free_mem_size(), max_available_device_memory_);
+}
 
 size_t GPUMemoryAllocator::total_mem_size() { return CudaDriver::total_mem_size(); }
 }  // namespace gpu
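Reviewer note: a minimal Python sketch of the budgeting scheme the two hunks above implement, to make the review easier. `BudgetedPool` and its fields are illustrative stand-ins for `GPUMemoryAllocator`, `CudaDriver::free_mem_size()`, `max_available_device_memory_`, and `total_used_device_memory_`; none of these Python names exist in the codebase.

```python
GB = 1024 * 1024 * 1024  # same factor as FloatToSize(max_device_memory * 1024 * 1024 * 1024)

class BudgetedPool:
    """Toy model of the allocator change: the pool never reports more free
    memory than the remaining user-configured budget."""

    def __init__(self, driver_free_bytes, max_device_memory_gb):
        self.driver_free = driver_free_bytes          # stands in for CudaDriver::free_mem_size()
        self.budget = int(max_device_memory_gb * GB)  # max_available_device_memory_
        self.total_used = 0                           # total_used_device_memory_

    def free_mem_size(self):
        # std::min(CudaDriver::free_mem_size(), max_available_device_memory_)
        return min(self.driver_free, self.budget)

    def alloc(self, size):
        if size > self.free_mem_size():
            raise MemoryError("Alloc device memory[%d] failed." % size)
        self.driver_free -= size
        self.budget -= size      # max_available_device_memory_ -= alloc_size
        self.total_used += size  # total_used_device_memory_ += alloc_size
        return size

pool = BudgetedPool(driver_free_bytes=16 * GB, max_device_memory_gb=3.5)
pool.alloc(2 * GB)
# The driver still has 14GB free, but the pool only admits the 1.5GB left in the budget.
assert pool.free_mem_size() == int(1.5 * GB)
```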
diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h
index 36374bfaad..baaf50b641 100644
--- a/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h
+++ b/mindspore/ccsrc/device/gpu/gpu_memory_allocator.h
@@ -48,6 +48,9 @@ class GPUMemoryAllocator : public DynamicMemPoolBestFit {
 
   // Used to track address of data buffer queue.
   DeviceMemPtr buffer_q_addr_{nullptr};
+
+  size_t total_used_device_memory_{0};
+  size_t max_available_device_memory_{0};
 };
 }  // namespace gpu
 }  // namespace device
diff --git a/mindspore/ccsrc/pipeline/init.cc b/mindspore/ccsrc/pipeline/init.cc
index 37faf7decc..0992b7fa66 100644
--- a/mindspore/ccsrc/pipeline/init.cc
+++ b/mindspore/ccsrc/pipeline/init.cc
@@ -143,7 +143,9 @@ PYBIND11_MODULE(_c_expression, m) {
     .def("get_profiling_options", &mindspore::MsContext::profiling_options, "Get options to profiling.")
     .def("set_profiling_options", &mindspore::MsContext::set_profiling_options, "Set options to profiling.")
     .def("get_check_bprop_flag", &mindspore::MsContext::check_bprop_flag, "Get whether to check bprop.")
-    .def("set_check_bprop_flag", &mindspore::MsContext::set_check_bprop_flag, "Set whether to check bprop.");
+    .def("set_check_bprop_flag", &mindspore::MsContext::set_check_bprop_flag, "Set whether to check bprop.")
+    .def("get_max_device_memory", &mindspore::MsContext::max_device_memory, "Get device memory max size.")
+    .def("set_max_device_memory", &mindspore::MsContext::set_max_device_memory, "Set device memory max size.");
 
   (void)py::class_<ParallelContext, std::shared_ptr<ParallelContext>>(m, "AutoParallelContext")
     .def_static("get_instance", &ParallelContext::GetInstance, "Get auto parallel context instance.")
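Reviewer note: with the bindings above, the context value round-trips to Python as a plain float in GB; the default comes from `kDefaultMaxDeviceMemory` in the ms_context hunks below, so an untouched context reports 1024 GB, which is effectively no cap on current hardware. `MsContextMock` here is a toy stand-in for the exported `MSContext` object, not real API:

```python
class MsContextMock:
    """Mimics the get_max_device_memory / set_max_device_memory bindings."""

    def __init__(self):
        self._max_device_memory = 1024.0  # kDefaultMaxDeviceMemory, in GB

    def get_max_device_memory(self):
        return self._max_device_memory

    def set_max_device_memory(self, value):
        self._max_device_memory = value

ctx = MsContextMock()
assert ctx.get_max_device_memory() == 1024.0  # default: effectively uncapped
ctx.set_max_device_memory(3.5)                # value arrives already parsed to float GB
assert ctx.get_max_device_memory() == 3.5
```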
diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc
index a726d79cd5..7440bd38f7 100644
--- a/mindspore/ccsrc/utils/context/ms_context.cc
+++ b/mindspore/ccsrc/utils/context/ms_context.cc
@@ -81,6 +81,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
   profiling_mode_ = false;
   profiling_options_ = "training_trace";
   check_bprop_flag_ = false;
+  max_device_memory_ = kDefaultMaxDeviceMemory;
 }
 
 std::shared_ptr<MsContext> MsContext::GetInstance() {
diff --git a/mindspore/ccsrc/utils/context/ms_context.h b/mindspore/ccsrc/utils/context/ms_context.h
index e7be85dc82..7a3da24acb 100644
--- a/mindspore/ccsrc/utils/context/ms_context.h
+++ b/mindspore/ccsrc/utils/context/ms_context.h
@@ -44,6 +44,8 @@ const char kAscendDevice[] = "Ascend";
 const char kDavinciDevice[] = "Davinci";
 const char KNpuLog[] = "_npu_log";
 const std::set<std::string> kTargetSet = {kCPUDevice, kGPUDevice, kAscendDevice, kDavinciDevice};
+// The default max available device memory is 1024GB.
+const float kDefaultMaxDeviceMemory = 1024;
 
 class MsContext {
  public:
@@ -143,6 +145,9 @@ class MsContext {
   bool check_bprop_flag() const { return check_bprop_flag_; }
   void set_check_bprop_flag(bool check_bprop_flag) { check_bprop_flag_ = check_bprop_flag; }
 
+  float max_device_memory() const { return max_device_memory_; }
+  void set_max_device_memory(float max_device_memory) { max_device_memory_ = max_device_memory; }
+
 private:
   MsContext(const std::string &backend_policy, const std::string &target);
   void GetGeOptions(std::map<std::string, std::string> *ge_options) const;
@@ -182,6 +187,7 @@ class MsContext {
   bool profiling_mode_;
   std::string profiling_options_;
   bool check_bprop_flag_;
+  float max_device_memory_;
 };
 
 }  // namespace mindspore
diff --git a/mindspore/context.py b/mindspore/context.py
index 1887363d5a..35f671a1c6 100644
--- a/mindspore/context.py
+++ b/mindspore/context.py
@@ -332,6 +332,17 @@ class _Context:
     def check_bprop(self, check_bprop_flag):
         self._context_handle.set_check_bprop_flag(check_bprop_flag)
+
+    @property
+    def max_device_memory(self):
+        return self._context_handle.get_max_device_memory()
+
+    @max_device_memory.setter
+    def max_device_memory(self, max_device_memory):
+        if not check_input_format(max_device_memory):
+            raise ValueError("Context param max_device_memory should be in correct format! Such as \"3.5GB\"")
+        max_device_memory_value = float(max_device_memory[:-2])
+        self._context_handle.set_max_device_memory(max_device_memory_value)
 
 def check_input_format(x):
     import re
     pattern = r'[1-9][0-9]*(\.)?[0-9]*GB|0\.[0-9]*GB'
@@ -459,7 +470,7 @@ def reset_auto_parallel_context():
                  save_graphs_path=str, save_ms_model=bool, save_ms_model_path=str, enable_dump=bool,
                  save_dump_path=str, enable_reduce_precision=bool, variable_memory_max_size=str,
                  enable_profiling=bool, profiling_options=str, enable_auto_mixed_precision=bool,
-                 check_bprop=bool)
+                 check_bprop=bool, max_device_memory=str)
 def set_context(**kwargs):
     """
     Sets context for running environment.
@@ -511,6 +522,7 @@ def set_context(**kwargs):
             separated by colons; single operator can choose op_trace, op_trace cannot be combined with
             training_trace and task_trace. Default: "training_trace".
         check_bprop (bool): Whether to check bprop. Default: False.
+        max_device_memory (str): Sets the maximum memory available for the device. Default: "1024GB".
 
     Raises:
         ValueError: If input key is not an attribute in context.
@@ -530,6 +542,7 @@ def set_context(**kwargs):
         >>> device_target="Ascend",device_id=0, save_graphs=True,
        >>> save_graphs_path="/mindspore")
         >>> context.set_context(enable_profiling=True, profiling_options="training_trace")
+        >>> context.set_context(max_device_memory="3.5GB")
     """
     for key, value in kwargs.items():
         if not hasattr(_context(), key):
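Reviewer note: the property setter accepts strings such as "3.5GB" and forwards only the numeric part. The full body of `check_input_format` sits outside the hunk above, so the `re.match` call in this sketch is an assumption based on the pattern it defines, and `parse_max_device_memory` is a hypothetical helper, not code from this patch:

```python
import re

# Pattern copied verbatim from check_input_format in mindspore/context.py.
PATTERN = r'[1-9][0-9]*(\.)?[0-9]*GB|0\.[0-9]*GB'

def parse_max_device_memory(value):
    # Assumed validation path; the real check_input_format body is not shown
    # in the hunk, so matching against its pattern is our best guess here.
    if not re.match(PATTERN, value):
        raise ValueError('Context param max_device_memory should be in correct format! Such as "3.5GB"')
    return float(value[:-2])  # strip the trailing "GB", as the property setter does

assert parse_max_device_memory("3.5GB") == 3.5
assert parse_max_device_memory("1024GB") == 1024.0  # the default
assert parse_max_device_memory("0.5GB") == 0.5      # second alternative in the pattern
try:
    parse_max_device_memory("3.5")                  # rejected: no "GB" suffix
except ValueError:
    pass
```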