|
|
|
@ -19,6 +19,7 @@
|
|
|
|
|
#include "hccl/hccl.h"
|
|
|
|
|
|
|
|
|
|
// Name of the environment variable that is looked up first (via std::getenv)
// to obtain the HCCL config file path handed to HcclCommInitClusterInfo.
constexpr const char *kHcclConfigFile = "MINDSPORE_HCCL_CONFIG_PATH";
|
|
|
|
|
// Name of the fallback environment variable consulted (via std::getenv) when
// MINDSPORE_HCCL_CONFIG_PATH is not set.
constexpr const char *kHcclConfigFileOld = "RANK_TABLE_FILE";
|
|
|
|
|
|
|
|
|
|
namespace mindspore {
|
|
|
|
|
namespace kernel {
|
|
|
|
@ -37,10 +38,25 @@ bool HcclContext::InitHccl() {
|
|
|
|
|
}
|
|
|
|
|
auto config_file = std::getenv(kHcclConfigFile);
|
|
|
|
|
if (config_file == nullptr) {
|
|
|
|
|
MS_LOG(ERROR) << "Get hccl config file failed";
|
|
|
|
|
config_file = std::getenv(kHcclConfigFileOld);
|
|
|
|
|
if (config_file == nullptr) {
|
|
|
|
|
MS_LOG(ERROR) << "Get hccl rank table file failed. Please export MINDSPORE_HCCL_CONFIG_PATH or RANK_TABLE_FILE";
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
auto rank_id = GetRankId();
|
|
|
|
|
try {
|
|
|
|
|
rank_id_ = std::stoi(rank_id);
|
|
|
|
|
} catch (std::invalid_argument &e) {
|
|
|
|
|
MS_LOG(ERROR) << "Invalid rankd id env:" << rank_id;
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (rank_id_ < 0 || rank_id_ > 7) {
|
|
|
|
|
MS_LOG(ERROR) << "rank_id needs to be between 0-7";
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
rank_id_ = std::stoi(GetRankId());
|
|
|
|
|
|
|
|
|
|
auto hccl_result = HcclCommInitClusterInfo(config_file, rank_id_, &hccl_comm_);
|
|
|
|
|
if (hccl_result != HCCL_SUCCESS) {
|
|
|
|
|