[ROCM] update fluid collective op for rocm, test=develop (#31075)

Author: Qi Li
Commit: ee76ea72de (parent: d8fa65a3a8)
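
The change applies one mechanical pattern across the collective operators: every #if defined(PADDLE_WITH_NCCL) guard is widened to also accept PADDLE_WITH_RCCL, raw cudaStream_t handles become the platform-neutral gpuStream_t alias, and each direct CUDA runtime call gains a HIP branch. The snippet below is a minimal standalone sketch of that pattern, not code from this commit; the gpuStream_t alias and SyncStream helper shown here are illustrative assumptions (Paddle defines its own alias elsewhere and wraps runtime calls in PADDLE_ENFORCE_CUDA_SUCCESS).

// Illustrative sketch only: compile-time selection of a CUDA/HIP-neutral
// stream type plus a synchronize helper. Names mirror the idea, not
// Paddle's exact API.
#include <cassert>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
inline void SyncStream(gpuStream_t stream) {
  // On ROCm, hipStreamSynchronize mirrors cudaStreamSynchronize.
  hipError_t err = hipStreamSynchronize(stream);
  assert(err == hipSuccess);
  (void)err;
}
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
inline void SyncStream(gpuStream_t stream) {
  cudaError_t err = cudaStreamSynchronize(stream);
  assert(err == cudaSuccess);
  (void)err;
}
#endif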

@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <cuda.h>
 #include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/platform/float16.h"

@@ -77,7 +77,7 @@ void OpTester::Run() {
 if (platform::is_cpu_place(place_)) {
 platform::EnableProfiler(platform::ProfilerState::kCPU);
 } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 platform::EnableProfiler(platform::ProfilerState::kAll);
 platform::SetDeviceId(config_.device_id);
 #else

@@ -13,7 +13,7 @@ endforeach()
 register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
-if(WITH_NCCL)
+if(WITH_NCCL OR WITH_RCCL)
 set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
 op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
 op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})

@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -36,7 +36,7 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
 PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
 platform::errors::PreconditionNotMet(
 "AllReduce op can run on gpu place only for now."));
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
 auto in = ctx.Input<framework::Tensor>("X");
 auto out = ctx.Output<framework::Tensor>("Out");
@@ -73,7 +73,11 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
 sendbuff, recvbuff, numel, static_cast<ncclDataType_t>(dtype), red_type,
 comm, stream));
 if (ctx.Attr<bool>("sync_mode")) {
+#ifdef PADDLE_WITH_RCCL
+PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
 PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
 }
 #else
 PADDLE_THROW(platform::errors::PreconditionNotMet(

@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/barrier_op.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -26,7 +26,7 @@ template <typename T>
 class BarrierOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 auto in = ctx.Input<framework::Tensor>("X");
 auto out = ctx.Output<framework::Tensor>("Out");
@@ -45,7 +45,11 @@ class BarrierOpCUDAKernel : public framework::OpKernel<T> {
 sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
 auto comm_stream =
 platform::NCCLCommContext::Instance().Get(rid, place)->stream();
+#ifdef PADDLE_WITH_RCCL
+PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(comm_stream));
+#else
 PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(comm_stream));
+#endif
 #else
 PADDLE_THROW(platform::errors::Unavailable(
 "PaddlePaddle should compile with NCCL."));

@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -33,7 +33,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
 platform::errors::PreconditionNotMet(
 "The place of ExecutionContext should be CUDAPlace."));
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 int dev_id = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).device;
 int root_dev_id = ctx.Attr<int>("root");
@@ -62,7 +62,11 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
 << " From " << root_dev_id << " to " << dev_id;
 if (ctx.Attr<bool>("sync_mode")) {
+#ifdef PADDLE_WITH_RCCL
+PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
 PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
 }
 #else
 PADDLE_THROW(platform::errors::PreconditionNotMet(

@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_allgather_op.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -26,7 +26,7 @@ template <typename T>
 class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 auto in = ctx.Input<framework::Tensor>("X");
 auto out = ctx.Output<framework::Tensor>("Out");
 ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
@@ -48,7 +48,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
 const T* send_buff = in->data<T>();
 T* recv_buff = out->data<T>();
-cudaStream_t stream = nullptr;
+gpuStream_t stream = nullptr;
 if (ctx.Attr<bool>("use_calc_stream")) {
 auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
 stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();

@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -109,7 +109,7 @@ template <ReduceType red_type, typename T>
 class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 auto in = ctx.Input<framework::Tensor>("X");
 auto out = ctx.Output<framework::Tensor>("Out");
@@ -123,7 +123,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
 int rid = ctx.Attr<int>("ring_id");
 auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
-cudaStream_t stream = nullptr;
+gpuStream_t stream = nullptr;
 if (ctx.Attr<bool>("use_calc_stream")) {
 auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
 stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();

@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -26,7 +26,7 @@ template <typename T>
 class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 auto x = ctx.Input<framework::LoDTensor>("X");
 auto out = ctx.Output<framework::LoDTensor>("Out");
 int numel = x->numel();
@@ -36,7 +36,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
 auto place = ctx.GetPlace();
 auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
-cudaStream_t stream = nullptr;
+gpuStream_t stream = nullptr;
 if (ctx.Attr<bool>("use_calc_stream")) {
 auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
 stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();

@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -52,7 +52,7 @@ class CCommInitAllOp : public framework::OperatorBase {
 platform::errors::PreconditionNotMet(
 "CCommInitAllOp can run on gpu place only"));
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 std::vector<int> devices = Attr<std::vector<int>>("devices");
 if (devices.empty()) {
 devices = platform::GetSelectedDevices();

@@ -14,6 +14,9 @@ limitations under the License. */
 #if defined(PADDLE_WITH_NCCL)
 #include <nccl.h>
 #endif
+#if defined(PADDLE_WITH_RCCL)
+#include <rccl.h>
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "xpu/bkcl.h"
 #endif
@@ -26,7 +29,8 @@ namespace framework {
 class Scope;
 } // namespace framework
 } // namespace paddle
-#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #endif
@@ -50,7 +54,7 @@ class CCommInitOp : public framework::OperatorBase {
 PADDLE_ENFORCE_NOT_NULL(
 var, platform::errors::InvalidArgument("Input con not be empty."));
 if (is_gpu_place(place)) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 ncclUniqueId* nccl_id = var->GetMutable<ncclUniqueId>();
 int nranks = Attr<int>("nranks");

@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -114,7 +114,7 @@ template <ReduceType red_type, typename T>
 class CReduceOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 auto in = ctx.Input<framework::Tensor>("X");
 auto out = ctx.Output<framework::Tensor>("Out");
@@ -129,7 +129,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel<T> {
 int root = ctx.Attr<int>("root_id");
 auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
-cudaStream_t stream = nullptr;
+gpuStream_t stream = nullptr;
 if (ctx.Attr<bool>("use_calc_stream")) {
 auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
 stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();

@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -26,7 +26,7 @@ template <typename T>
 class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 auto in = ctx.Input<framework::Tensor>("X");
 auto out = ctx.Output<framework::Tensor>("Out");
@@ -49,7 +49,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> {
 T* recv_buff = out->data<T>();
 int dtype = platform::ToNCCLDataType(in->type());
-cudaStream_t stream = nullptr;
+gpuStream_t stream = nullptr;
 if (ctx.Attr<bool>("use_calc_stream")) {
 auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
 stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();

@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_scatter_op.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -26,7 +26,7 @@ template <typename T>
 class CScatterOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 auto x = ctx.Input<framework::LoDTensor>("X");
 auto out = ctx.Output<framework::LoDTensor>("Out");
 int numel = x->numel();
@@ -53,7 +53,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel<T> {
 "The ring_id (%d) for c_scatter_op must be non-negative.",
 ring_id));
-cudaStream_t stream = nullptr;
+gpuStream_t stream = nullptr;
 if (ctx.Attr<bool>("use_calc_stream")) {
 auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
 stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();

@@ -37,10 +37,14 @@ class CSyncCalcStreamOp : public framework::OperatorBase {
 PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
 platform::errors::PreconditionNotMet(
 "Sync stream op can run on gpu place only for now."));
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
 auto dev_ctx = static_cast<platform::CUDADeviceContext*>(
 platform::DeviceContextPool::Instance().Get(place));
+#ifdef PADDLE_WITH_HIP
+PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream()));
+#else
 PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream()));
+#endif
 #else
 PADDLE_THROW(platform::errors::PreconditionNotMet(
 "PaddlePaddle should compile with GPU."));

@@ -19,7 +19,7 @@ namespace framework {
 class Scope;
 } // namespace framework
 } // namespace paddle
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #endif
@@ -40,11 +40,15 @@ class CSyncCommStreamOp : public framework::OperatorBase {
 platform::errors::PreconditionNotMet(
 "Sync stream op can run on gpu place only for now."));
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 int ring_id = Attr<int>("ring_id");
 auto stream =
 platform::NCCLCommContext::Instance().Get(ring_id, place)->stream();
+#ifdef PADDLE_WITH_RCCL
+PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
 PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
 #else
 PADDLE_THROW(platform::errors::PreconditionNotMet(
 "PaddlePaddle should compile with GPU."));

@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/recv_v2_op.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -26,7 +26,8 @@ template <typename T>
 class RecvOpV2CUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext &ctx) const override {
-#if defined(PADDLE_WITH_NCCL) && NCCL_VERSION_CODE >= 2703
+#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \
+NCCL_VERSION_CODE >= 2703
 int rid = ctx.Attr<int>("ring_id");
 PADDLE_ENFORCE_GE(
 rid, 0,
@@ -45,7 +46,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel<T> {
 framework::proto::VarType::Type type =
 framework::proto::VarType::Type(data_type);
-cudaStream_t stream = nullptr;
+gpuStream_t stream = nullptr;
 auto place = ctx.GetPlace();
 auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
 if (ctx.Attr<bool>("use_calc_stream")) {
@@ -65,12 +66,21 @@ class RecvOpV2CUDAKernel : public framework::OpKernel<T> {
 // Recv the number of elements to receive first
 int numel = 0;
 int *numel_ptr = nullptr;
+#ifdef PADDLE_WITH_RCCL
+PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&numel_ptr, sizeof(int)));
+#else
 PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&numel_ptr, sizeof(int)));
+#endif
 PADDLE_ENFORCE_CUDA_SUCCESS(
 platform::dynload::ncclRecv(static_cast<void *>(numel_ptr), 1, ncclInt,
 peer, comm->comm(), stream));
+#ifdef PADDLE_WITH_RCCL
+PADDLE_ENFORCE_CUDA_SUCCESS(
+hipMemcpy(&numel, numel_ptr, sizeof(int), hipMemcpyDeviceToHost));
+#else
 PADDLE_ENFORCE_CUDA_SUCCESS(
 cudaMemcpy(&numel, numel_ptr, sizeof(int), cudaMemcpyDeviceToHost));
+#endif
 int rest_numel = 1;
 for (int i = 1; i < out_dims.size(); ++i) {

@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/send_v2_op.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -26,7 +26,8 @@ template <typename T>
 class SendOpV2CUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL) && NCCL_VERSION_CODE >= 2703
+#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \
+NCCL_VERSION_CODE >= 2703
 auto x = ctx.Input<framework::LoDTensor>("X");
 int numel = x->numel();
@@ -41,7 +42,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel<T> {
 peer, 0,
 platform::errors::InvalidArgument(
 "The peer (%d) for send_v2 op must be non-negative.", peer));
-cudaStream_t stream = nullptr;
+gpuStream_t stream = nullptr;
 auto place = ctx.GetPlace();
 auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
 if (ctx.Attr<bool>("use_calc_stream")) {
@@ -59,9 +60,15 @@ class SendOpV2CUDAKernel : public framework::OpKernel<T> {
 // Send number of elements to the receiver, as the receiver may have
 // no information of the Tensor size.
 int* numel_ptr = nullptr;
+#ifdef PADDLE_WITH_RCCL
+PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&numel_ptr, sizeof(int)));
+PADDLE_ENFORCE_CUDA_SUCCESS(
+hipMemcpy(numel_ptr, &numel, sizeof(int), hipMemcpyHostToDevice));
+#else
 PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&numel_ptr, sizeof(int)));
 PADDLE_ENFORCE_CUDA_SUCCESS(
 cudaMemcpy(numel_ptr, &numel, sizeof(int), cudaMemcpyHostToDevice));
+#endif
 PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend(
 numel_ptr, 1, ncclInt, peer, comm->comm(), stream));
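
A note on the send_v2/recv_v2 hunks above (context, not part of the diff): because the receiver may not know the tensor size, the sender first transmits the element count with a one-int ncclSend, and the receiver copies that count back to host before receiving the payload. A hedged sketch of this protocol with raw NCCL point-to-point calls (available since NCCL 2.7), assuming an already initialized ncclComm_t and stream, might look like:

// Sketch under assumptions (not Paddle code): exchange the element count
// first so the receiving side can size its buffer, then move the payload.
#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

void SendWithSize(const float* dev_data, int numel, int peer,
                  ncclComm_t comm, cudaStream_t stream) {
  int* numel_dev = nullptr;
  cudaMalloc(&numel_dev, sizeof(int));
  cudaMemcpy(numel_dev, &numel, sizeof(int), cudaMemcpyHostToDevice);
  ncclSend(numel_dev, 1, ncclInt, peer, comm, stream);       // size first
  ncclSend(dev_data, numel, ncclFloat, peer, comm, stream);  // then payload
  cudaStreamSynchronize(stream);
  cudaFree(numel_dev);
}

std::vector<float> RecvWithSize(int peer, ncclComm_t comm,
                                cudaStream_t stream) {
  int numel = 0;
  int* numel_dev = nullptr;
  cudaMalloc(&numel_dev, sizeof(int));
  ncclRecv(numel_dev, 1, ncclInt, peer, comm, stream);
  cudaStreamSynchronize(stream);
  cudaMemcpy(&numel, numel_dev, sizeof(int), cudaMemcpyDeviceToHost);
  float* dev_buf = nullptr;
  cudaMalloc(&dev_buf, numel * sizeof(float));
  ncclRecv(dev_buf, numel, ncclFloat, peer, comm, stream);
  cudaStreamSynchronize(stream);
  std::vector<float> host(numel);
  cudaMemcpy(host.data(), dev_buf, numel * sizeof(float),
             cudaMemcpyDeviceToHost);
  cudaFree(dev_buf);
  cudaFree(numel_dev);
  return host;
}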

@@ -34,7 +34,7 @@ struct StridedMemcpyFunctor<T, 0> {
 auto& cpu_place = BOOST_GET_CONST(platform::CPUPlace, place);
 memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
 } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 auto& gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place);
 auto& cuda_ctx =
 reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
@@ -58,7 +58,7 @@ struct StridedMemcpyFunctor<T, 1> {
 auto& cpu_place = BOOST_GET_CONST(platform::CPUPlace, place);
 memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
 } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 auto& gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place);
 auto& cuda_ctx =
 reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
