|
|
|
@@ -14,7 +14,7 @@ limitations under the License. */
|
|
|
|
|
|
|
|
|
|
#include "paddle/fluid/operators/collective/send_v2_op.h"
|
|
|
|
|
|
|
|
|
|
#if defined(PADDLE_WITH_NCCL)
|
|
|
|
|
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
|
|
|
|
|
#include "paddle/fluid/platform/collective_helper.h"
|
|
|
|
|
#include "paddle/fluid/platform/nccl_helper.h"
|
|
|
|
|
#endif
|
|
|
|
@@ -26,7 +26,8 @@ template <typename T>
|
|
|
|
|
class SendOpV2CUDAKernel : public framework::OpKernel<T> {
|
|
|
|
|
public:
|
|
|
|
|
void Compute(const framework::ExecutionContext& ctx) const override {
|
|
|
|
|
#if defined(PADDLE_WITH_NCCL) && NCCL_VERSION_CODE >= 2703
|
|
|
|
|
#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \
|
|
|
|
|
NCCL_VERSION_CODE >= 2703
|
|
|
|
|
auto x = ctx.Input<framework::LoDTensor>("X");
|
|
|
|
|
int numel = x->numel();
|
|
|
|
|
|
|
|
|
@@ -41,7 +42,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel<T> {
|
|
|
|
|
peer, 0,
|
|
|
|
|
platform::errors::InvalidArgument(
|
|
|
|
|
"The peer (%d) for send_v2 op must be non-negative.", peer));
|
|
|
|
|
cudaStream_t stream = nullptr;
|
|
|
|
|
gpuStream_t stream = nullptr;
|
|
|
|
|
auto place = ctx.GetPlace();
|
|
|
|
|
auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
|
|
|
|
|
if (ctx.Attr<bool>("use_calc_stream")) {
|
|
|
|
@@ -59,9 +60,15 @@ class SendOpV2CUDAKernel : public framework::OpKernel<T> {
|
|
|
|
|
// Send number of elements to the receiver, as the receiver may have
|
|
|
|
|
// no information of the Tensor size.
|
|
|
|
|
int* numel_ptr = nullptr;
|
|
|
|
|
#ifdef PADDLE_WITH_RCCL
|
|
|
|
|
PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&numel_ptr, sizeof(int)));
|
|
|
|
|
PADDLE_ENFORCE_CUDA_SUCCESS(
|
|
|
|
|
hipMemcpy(numel_ptr, &numel, sizeof(int), hipMemcpyHostToDevice));
|
|
|
|
|
#else
|
|
|
|
|
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&numel_ptr, sizeof(int)));
|
|
|
|
|
PADDLE_ENFORCE_CUDA_SUCCESS(
|
|
|
|
|
cudaMemcpy(numel_ptr, &numel, sizeof(int), cudaMemcpyHostToDevice));
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend(
|
|
|
|
|
numel_ptr, 1, ncclInt, peer, comm->comm(), stream));
|
|
|
|
|