// You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
// graphengine/ge/graph/manager/memory_api.cc
//
// 119 lines
// 4.5 KiB

/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "framework/memory/memory_api.h"
#include <memory>
#include "common/ge/plugin_manager.h"
#include "graph/manager/graph_mem_allocator.h"
#include "graph/manager/host_mem_manager.h"
#include "graph/manager/rdma_pool_allocator.h"
#include "graph/utils/type_utils.h"
#include "hccl/base.h"
#include "hccl/hccl_types.h"
namespace ge {
// Initializes the RDMA pool selected by `mem_type` with `size`.
//
// Looks up the pool through MemManager::Instance().RdmaPoolInstance(mem_type)
// and returns whatever its InitMemory(size) call returns.
Status InitRdmaPool(size_t size, rtMemType_t mem_type) {
  GELOGD("InitRdmaPool in");
  auto &&rdma_pool = MemManager::Instance().RdmaPoolInstance(mem_type);
  return rdma_pool.InitMemory(size);
}
// Registers host variable memory together with the RDMA pool range through HCCL.
//
// Builds a table with one {base_addr, var_size} entry per element of `var_info`,
// followed by one final entry holding the base address and size reported by the
// RDMA pool for `mem_type`. The table is handed to `HcomRegRemoteAccessMem`,
// which is resolved at runtime from "libhccl.so" under PluginManager::GetPath().
// The library handle is closed again when the function leaves scope.
//
// @param var_info  Host variables whose address ranges are registered.
// @param mem_type  Memory type used to select the RDMA pool instance.
// @return SUCCESS on success; FAILED when the table is too long for the HCCL
//         interface, the library path cannot be resolved or the symbol is
//         missing; HCCL_E_INTERNAL when the HCCL call reports an error. The
//         GE_CHK_STATUS_RET / GE_CHECK_NOTNULL macros may also return early
//         (their return values are defined outside this file).
Status RdmaRemoteRegister(const std::vector<HostVarInfo> &var_info, rtMemType_t mem_type) {
  GELOGD("Start to register rdma memory with host var size %zu", var_info.size());
  uint64_t device_base = 0;
  uint64_t device_size = 0;
  GE_CHK_STATUS_RET(MemManager::Instance().RdmaPoolInstance(mem_type).GetBaseAddr(device_base, device_size));

  // One slot per host variable plus a trailing slot for the RDMA pool range.
  const size_t table_len = var_info.size() + 1;
  // The HCCL entry point takes the entry count as uint32_t; refuse oversized
  // tables instead of silently truncating the count.
  const uint32_t table_len_u32 = static_cast<uint32_t>(table_len);
  if (static_cast<size_t>(table_len_u32) != table_len) {
    REPORT_INNER_ERROR("E19999", "Register table length %zu exceeds uint32 range, check invalid", table_len);
    GELOGE(FAILED, "Register table length %zu exceeds uint32 range", table_len);
    return FAILED;
  }

  std::unique_ptr<MemRegisterAddr[]> reg_addrs(new (std::nothrow) MemRegisterAddr[table_len]);
  GE_CHECK_NOTNULL(reg_addrs);
  for (size_t i = 0; i < var_info.size(); ++i) {
    reg_addrs[i] = {var_info[i].base_addr, var_info[i].var_size};
  }
  reg_addrs[table_len - 1] = {device_base, device_size};

  std::string file_name = "libhccl.so";
  std::string path = PluginManager::GetPath();
  path.append(file_name);
  string canonical_path = RealPath(path.c_str());
  if (canonical_path.empty()) {
    // canonical_path is empty on this branch, so report the path that failed to resolve.
    REPORT_INNER_ERROR("E19999", "canonical path of %s is empty, check invalid",
                       path.c_str());
    GELOGE(FAILED, "Failed to get realpath of %s", path.c_str());
    return FAILED;
  }
  GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str());

  auto handle = dlopen(canonical_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
  GE_CHECK_NOTNULL(handle);
  // Close the library on every exit path below.
  GE_MAKE_GUARD(not_used_var, [&] {
    if (dlclose(handle) != 0) {
      GELOGW("Failed to close handle %s", dlerror());
    }
  });

  using HcomRegRemoteAccessMemFunc = HcclResult (*)(const MemRegisterAddr *, uint32_t);
  const auto hcom_remote_mem_register =
      reinterpret_cast<HcomRegRemoteAccessMemFunc>(dlsym(handle, "HcomRegRemoteAccessMem"));
  if (hcom_remote_mem_register == nullptr) {
    REPORT_CALL_ERROR("E19999", "Symbol HcomRegRemoteAccessMem can't find in %s, check invalid",
                      canonical_path.c_str());
    GELOGE(FAILED, "Failed to invoke hcom_remote_mem_register function.");
    return FAILED;
  }

  HcclResult hccl_ret = hcom_remote_mem_register(reg_addrs.get(), table_len_u32);
  if (hccl_ret != HCCL_SUCCESS) {
    REPORT_CALL_ERROR("E19999", "Call hcom_remote_mem_register failed, ret:%d,",
                      hccl_ret);
    GELOGE(HCCL_E_INTERNAL, "Rdma mem register failed, ret: 0x%X", hccl_ret);
    return HCCL_E_INTERNAL;
  }
  return SUCCESS;
}
// Allocates host shared memory sized for the tensor described by `tensor_info`.
//
// The byte size is the element length returned by TypeUtils::GetDataTypeLength
// for `tensor_info.data_type`, multiplied by every entry of `tensor_info.dims`.
// The allocation is requested from HostMemManager under `tensor_info.var_name`.
//
// @param tensor_info  Variable name, data type and dims of the tensor.
// @param dev_addr     Out: `device_address` of the allocated block as an integer;
//                     written only on success.
// @param memory_size  Out: computed size in bytes. It is written while the size
//                     is being accumulated, so it may hold a partial product
//                     when the function fails.
// @return SUCCESS on success; GRAPH_FAILED when the data type length is
//         unknown, a dim is not positive, the size overflows uint64_t, or the
//         allocation fails.
Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uint64_t &memory_size) {
  GELOGD("MallocSharedMemory in");
  uint32_t type_size = 0;
  bool result = TypeUtils::GetDataTypeLength(tensor_info.data_type, type_size);
  if (!result) {
    GELOGE(GRAPH_FAILED, "GetDataTypeLength failed, data_type=(%s).",
           TypeUtils::DataTypeToSerialString(tensor_info.data_type).c_str());
    return GRAPH_FAILED;
  }
  memory_size = type_size;
  for (auto dim : tensor_info.dims) {
    if (dim <= 0) {
      GELOGE(GRAPH_FAILED, "Tensor dims should be positive");
      return GRAPH_FAILED;
    }
    // Unsigned multiplication wraps on overflow; dividing the product back
    // detects the wrap so an undersized block is never requested.
    const uint64_t extent = static_cast<uint64_t>(dim);
    const uint64_t scaled_size = memory_size * extent;
    if (scaled_size / extent != memory_size) {
      GELOGE(GRAPH_FAILED, "Tensor memory size overflows uint64, op name [%s]", tensor_info.var_name.c_str());
      return GRAPH_FAILED;
    }
    memory_size = scaled_size;
  }
  SharedMemInfo mem_info(tensor_info.var_name, memory_size);
  Status ret = HostMemManager::Instance().MallocSharedMemory(mem_info);
  if (ret != SUCCESS) {
    GELOGE(GRAPH_FAILED, "MallocSharedMemory failed op name [%s]", tensor_info.var_name.c_str());
    return GRAPH_FAILED;
  }
  dev_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(mem_info.device_address));
  GELOGD("MallocSharedMemory Succeeded");
  return SUCCESS;
}
// Queries the host memory manager for the memory info of `var_name`.
//
// `base_addr` and `var_size` are passed through to
// HostMemManager::QueryVarMemInfo, whose result is returned unchanged.
Status GetVarBaseAddrAndSize(const string &var_name, uint64_t &base_addr, uint64_t &var_size) {
  GELOGD("GetVarBaseAddrAndSize in");
  auto &&host_mem_manager = HostMemManager::Instance();
  return host_mem_manager.QueryVarMemInfo(var_name, base_addr, var_size);
}
} // namespace ge