dygraph: NCCL init supports host domain names (#28107)

* NCCL init supports both hostnames and IP addresses; test=develop
revert-27871-prv-conv-grad-opt
danleifeng 5 years ago committed by GitHub
parent 5cd97a1cb0
commit f29fb396df
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -100,7 +100,19 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
serv_addr.sin_family = AF_INET;
serv_addr.sin_port = htons(port);
if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) {
char *ip = NULL;
struct hostent *hp;
if ((hp = gethostbyname(host.c_str())) == NULL) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Fail to get host by name %s.", host));
}
int i = 0;
while (hp->h_addr_list[i] != NULL) {
ip = inet_ntoa(*(struct in_addr *)hp->h_addr_list[i]);
VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip;
break;
}
if (inet_pton(AF_INET, ip, &serv_addr.sin_addr) <= 0) {
PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep));
}

@ -16,6 +16,7 @@
// network header files
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>

@ -20,7 +20,7 @@ namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
imperative::ParallelStrategy GetStrategy(int local_rank) {
std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"};
std::vector<std::string> eps = {"127.0.0.1:9866", "localhost:9867"};
imperative::ParallelStrategy strategy;
strategy.trainer_endpoints_ = eps;
strategy.current_endpoint_ = eps[local_rank];

Loading…
Cancel
Save