Replace platform::dynload in fluid/operators (#63833)
co63oc authored Apr 25, 2024
1 parent 86a3e5f commit 0455cd9
Showing 30 changed files with 478 additions and 485 deletions.
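Every hunk below applies the same mechanical change: an NCCL call resolved through the fluid platform::dynload wrappers now goes through the equivalent phi::dynload wrapper, with the argument list left untouched. A minimal before/after sketch of the pattern (a sketch only, reusing the buffer, communicator, and stream names that appear in the hunks below rather than a complete kernel):

  // Before: NCCL symbol looked up through the fluid platform dynload shims.
  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
      sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));

  // After: the same call routed through the phi dynload shims; only the
  // namespace changes, the NCCL signature and arguments stay the same.
  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
      sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));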
8 changes: 4 additions & 4 deletions paddle/fluid/operators/collective/alltoall_op.cu.cc
@@ -114,15 +114,15 @@ class AllToAllOpCUDAKernel : public framework::OpKernel<T> {
       comm_ctx->GroupEnd();
       VLOG(3) << "new comm_context_manager has rid " << ring_id;
     } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGroupStart());
       for (auto i = 0; i < nranks; ++i) {
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
+        PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclSend(
             send_buf + offset, send_numel, dtype, i, comm->comm(), stream));
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
+        PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRecv(
             recv_buf + offset, send_numel, dtype, i, comm->comm(), stream));
         offset += send_numel;
       }
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGroupEnd());
       VLOG(3) << "old NCCLCommContext has rid " << ring_id;
     }
 #else
14 changes: 7 additions & 7 deletions paddle/fluid/operators/collective/barrier_op.cu.cc
@@ -71,13 +71,13 @@ class BarrierOpCUDAKernel : public framework::OpKernel<T> {
     // should ExecutionContext for calc stream.
     auto stream = ctx.cuda_device_context().stream();
     ncclRedOp_t nccl_red_type = ncclSum;
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff,
-                                                                recvbuff,
-                                                                numel,
-                                                                dtype,
-                                                                nccl_red_type,
-                                                                comm->comm(),
-                                                                stream));
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(sendbuff,
+                                                           recvbuff,
+                                                           numel,
+                                                           dtype,
+                                                           nccl_red_type,
+                                                           comm->comm(),
+                                                           stream));
     platform::GpuStreamSync(stream);
     VLOG(3) << "old NCCLCommContext has rid " << rid;
   }
12 changes: 6 additions & 6 deletions paddle/fluid/operators/collective/c_allgather_op.cu.cc
@@ -103,12 +103,12 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
       comm_ctx->AllGather(out, *in, stream);
     } else {
       PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::ncclAllGather(send_buff,
-                                           recv_buff,
-                                           send_numel,
-                                           static_cast<ncclDataType_t>(dtype),
-                                           comm->comm(),
-                                           stream));
+          phi::dynload::ncclAllGather(send_buff,
+                                      recv_buff,
+                                      send_numel,
+                                      static_cast<ncclDataType_t>(dtype),
+                                      comm->comm(),
+                                      stream));
     }
 
 #else
14 changes: 7 additions & 7 deletions paddle/fluid/operators/collective/c_allreduce_op.h
@@ -427,13 +427,13 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
     if (comm_ctx) {
       comm_ctx->AllReduce(out, *in, nccl_red_type, stream);
     } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff,
-                                                                  recvbuff,
-                                                                  numel,
-                                                                  dtype,
-                                                                  nccl_red_type,
-                                                                  comm->comm(),
-                                                                  stream));
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(sendbuff,
+                                                             recvbuff,
+                                                             numel,
+                                                             dtype,
+                                                             nccl_red_type,
+                                                             comm->comm(),
+                                                             stream));
     }
 #else
     PADDLE_THROW(phi::errors::PreconditionNotMet(
4 changes: 2 additions & 2 deletions paddle/fluid/operators/collective/c_broadcast_op.cu.cc
@@ -54,7 +54,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
       platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype()));
   auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
   if (root == comm->rank()) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast(
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBcast(
         reinterpret_cast<void*>(const_cast<T*>(x->data<T>())),
         numel,
         dtype,
@@ -71,7 +71,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
           static_cast<phi::DenseTensor*>(out));
     }
   } else {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast(
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBcast(
         out->data<T>(), numel, dtype, root, comm->comm(), stream));
     VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received "
             << common::product(out->dims());
12 changes: 6 additions & 6 deletions paddle/fluid/operators/collective/c_concat_op.cu.cc
@@ -130,12 +130,12 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> {
       comm_ctx->AllGather(&temp_out, *x, stream);
     } else {
       PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::ncclAllGather(send_buff,
-                                           recv_buff,
-                                           send_numel,
-                                           static_cast<ncclDataType_t>(dtype),
-                                           comm->comm(),
-                                           stream));
+          phi::dynload::ncclAllGather(send_buff,
+                                      recv_buff,
+                                      send_numel,
+                                      static_cast<ncclDataType_t>(dtype),
+                                      comm->comm(),
+                                      stream));
     }
   }
 
2 changes: 1 addition & 1 deletion paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
@@ -30,7 +30,7 @@ namespace operators {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 static void GenNCCLID(std::vector<ncclUniqueId>* nccl_ids) {
   for (auto& nccl_id : *nccl_ids) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id));
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id));
   }
 }
 
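For context, GenNCCLID above only fills in the unique ids; in standard NCCL usage such an id is later consumed by ncclCommInitRank on every participating rank. A rough sketch of that bootstrap flow, assuming phi::dynload also wraps ncclCommInitRank with the stock NCCL signature (the id exchange itself, which these gen_nccl_id operators exist to handle over RPC, is elided):

  // Hypothetical sketch of the usual NCCL bootstrap around a unique id.
  ncclUniqueId nccl_id;
  if (rank == 0) {
    // Same call GenNCCLID makes, now through the phi dynload wrapper.
    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id));
  }
  // ... distribute nccl_id to all ranks out of band (RPC, file, env, ...) ...
  ncclComm_t comm;
  PADDLE_ENFORCE_GPU_SUCCESS(
      phi::dynload::ncclCommInitRank(&comm, nranks, nccl_id, rank));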
16 changes: 8 additions & 8 deletions paddle/fluid/operators/collective/c_reduce_op.h
@@ -321,14 +321,14 @@ class CReduceOpCUDAKernel : public framework::OpKernel<T> {
     if (comm_ctx) {
       comm_ctx->Reduce(out, *in, nccl_red_type, root, stream);
     } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(sendbuff,
-                                                               recvbuff,
-                                                               numel,
-                                                               dtype,
-                                                               nccl_red_type,
-                                                               root,
-                                                               comm->comm(),
-                                                               stream));
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduce(sendbuff,
+                                                          recvbuff,
+                                                          numel,
+                                                          dtype,
+                                                          nccl_red_type,
+                                                          root,
+                                                          comm->comm(),
+                                                          stream));
     }
 #else
     PADDLE_ENFORCE_EQ(
16 changes: 8 additions & 8 deletions paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
@@ -107,14 +107,14 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> {
     if (comm_ctx) {
       comm_ctx->ReduceScatter(out, *in, ncclSum, stream);
     } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter(
-          send_buff,
-          recv_buff,
-          recv_numel,
-          static_cast<ncclDataType_t>(dtype),
-          ncclSum,
-          comm->comm(),
-          stream));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          phi::dynload::ncclReduceScatter(send_buff,
+                                          recv_buff,
+                                          recv_numel,
+                                          static_cast<ncclDataType_t>(dtype),
+                                          ncclSum,
+                                          comm->comm(),
+                                          stream));
     }
 #else
     PADDLE_THROW(phi::errors::PreconditionNotMet(
4 changes: 2 additions & 2 deletions paddle/fluid/operators/collective/c_scatter_op.cu.cc
@@ -123,7 +123,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel<T> {
       }
     } else {
       if (root_id == comm->rank()) {
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast(
+        PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBcast(
             reinterpret_cast<void*>(const_cast<T*>(x->data<T>())),
             numel,
             dtype,
@@ -137,7 +137,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel<T> {
             *platform::DeviceContextPool::Instance().Get(place),
             static_cast<phi::DenseTensor*>(&temp));
       } else {
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast(
+        PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBcast(
             out_ptr, numel, dtype, root_id, comm->comm(), stream));
       }
     }
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
@@ -212,7 +212,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     } else {
       void* logits_max_buff = logits_max.mutable_data<T>(place);
 
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
           logits_max_buff,
           logits_max_buff,
           logits_max.numel(),
@@ -276,7 +276,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
       comm_ctx->AllReduce(&predicted_logits, predicted_logits, ncclSum, stream);
     } else {
       void* predict_logits_buff = predicted_logits.data();
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
           predict_logits_buff,
           predict_logits_buff,
           predicted_logits.numel(),
@@ -302,7 +302,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
       comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream);
     } else {
       void* sum_exp_logits_buff = sum_exp_logits.data();
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
          sum_exp_logits_buff,
          sum_exp_logits_buff,
          sum_exp_logits.numel(),
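The three reductions in this functor are the usual numerically stable softmax-with-cross-entropy computed over logits sharded across ranks: roughly softmax(x)_i = exp(x_i - max_j x_j) / sum_j exp(x_j - max_j x_j), so the global logits max, the logit selected by the label, and the global sum of exponentials each need one cross-rank reduction before the loss can be formed. Only the dynload namespace changes here; the reduction pattern itself is untouched.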
2 changes: 1 addition & 1 deletion paddle/fluid/operators/collective/gen_nccl_id_op.cc
@@ -37,7 +37,7 @@ namespace operators {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 static void GenNCCLID(std::vector<ncclUniqueId>* nccl_ids) {
   for (auto& nccl_id : *nccl_ids) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id));
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id));
   }
 }
 
32 changes: 16 additions & 16 deletions paddle/fluid/operators/collective/global_gather_op.cu.cc
@@ -165,30 +165,30 @@ struct GlobalGatherFunctor<phi::GPUContext, T> {
     auto send_buf = x->data<T>();
     auto recv_buf = out->data<T>();
     for (auto i = 0; i < n_expert; ++i) {
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGroupStart());
       for (auto j = 0; j < nranks; ++j) {
         int idx = i + j * n_expert;
         if (cpu_global_count_data[idx]) {
-          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
-              send_buf + send_ptr * in_feat,
-              cpu_global_count_data[idx] * in_feat,
-              dtype,
-              j,
-              comm->comm(),
-              stream));
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::ncclSend(send_buf + send_ptr * in_feat,
+                                     cpu_global_count_data[idx] * in_feat,
+                                     dtype,
+                                     j,
+                                     comm->comm(),
+                                     stream));
           send_ptr += cpu_global_count_data[idx];
         }
         if (cpu_local_count_data[idx]) {
-          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
-              recv_buf + expert_ptr[idx] * in_feat,
-              cpu_local_count_data[idx] * in_feat,
-              dtype,
-              j,
-              comm->comm(),
-              stream));
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::ncclRecv(recv_buf + expert_ptr[idx] * in_feat,
+                                     cpu_local_count_data[idx] * in_feat,
+                                     dtype,
+                                     j,
+                                     comm->comm(),
+                                     stream));
         }
       }
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGroupEnd());
     }
   }
 #else
32 changes: 16 additions & 16 deletions paddle/fluid/operators/collective/global_scatter_op.cu.cc
@@ -173,30 +173,30 @@ struct GlobalScatterFunctor<phi::GPUContext, T> {
     auto recv_buf = out->data<T>();
 
     for (auto i = 0; i < n_expert; ++i) {
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGroupStart());
       for (auto j = 0; j < nranks; ++j) {
         int idx = i + j * n_expert;
         if (cpu_local_count_data[idx]) {
-          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
-              send_buf + expert_ptr[idx] * in_feat,
-              cpu_local_count_data[idx] * in_feat,
-              dtype,
-              j,
-              comm->comm(),
-              stream));
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::ncclSend(send_buf + expert_ptr[idx] * in_feat,
+                                     cpu_local_count_data[idx] * in_feat,
+                                     dtype,
+                                     j,
+                                     comm->comm(),
+                                     stream));
         }
         if (cpu_global_count_data[idx]) {
-          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
-              recv_buf + recv_ptr * in_feat,
-              cpu_global_count_data[idx] * in_feat,
-              dtype,
-              j,
-              comm->comm(),
-              stream));
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::ncclRecv(recv_buf + recv_ptr * in_feat,
+                                     cpu_global_count_data[idx] * in_feat,
+                                     dtype,
+                                     j,
+                                     comm->comm(),
+                                     stream));
           recv_ptr += cpu_global_count_data[idx];
         }
       }
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGroupEnd());
     }
   }
 
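In both GlobalGatherFunctor and GlobalScatterFunctor above, the per-peer ncclSend/ncclRecv calls are issued between ncclGroupStart() and ncclGroupEnd(); grouping the point-to-point calls this way lets NCCL progress them as one fused operation instead of serializing (and potentially deadlocking on) each individual send and receive. The switch to phi::dynload leaves that grouping behavior unchanged.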
12 changes: 6 additions & 6 deletions paddle/fluid/operators/collective/partial_allgather_op.cu.cc
@@ -128,12 +128,12 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel<T> {
       const T* send_buff = in->data<T>() + offset;
       T* recv_buff = out->data<T>();
       PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::ncclAllGather(send_buff,
-                                           recv_buff,
-                                           send_numel,
-                                           static_cast<ncclDataType_t>(dtype),
-                                           comm->comm(),
-                                           stream));
+          phi::dynload::ncclAllGather(send_buff,
+                                      recv_buff,
+                                      send_numel,
+                                      static_cast<ncclDataType_t>(dtype),
+                                      comm->comm(),
+                                      stream));
     }
   }
 #else
12 changes: 6 additions & 6 deletions paddle/fluid/operators/collective/partial_recv_op.cu.cc
@@ -150,12 +150,12 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel<T> {
       comm_ctx->Recv(&recv_buf, recv_numel, peer, stream);
     } else {
       PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::ncclRecv(out->data<T>() + offset,
-                                      recv_numel,
-                                      dtype,
-                                      peer,
-                                      comm->comm(),
-                                      stream));
+          phi::dynload::ncclRecv(out->data<T>() + offset,
+                                 recv_numel,
+                                 dtype,
+                                 peer,
+                                 comm->comm(),
+                                 stream));
     }
     VLOG(3) << "rank " << rank << " recv " << recv_numel << " from offset["
             << offset << "] from " << peer;
13 changes: 6 additions & 7 deletions paddle/fluid/operators/collective/partial_send_op.cu.cc
@@ -144,13 +144,12 @@ class PartialSendCUDAKernel : public framework::OpKernel<T> {
 
       comm_ctx->Send(send_buf, send_numel, peer, stream);
     } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::ncclSend(x->data<T>() + offset,
-                                      send_numel,
-                                      dtype,
-                                      peer,
-                                      comm->comm(),
-                                      stream));
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclSend(x->data<T>() + offset,
+                                                        send_numel,
+                                                        dtype,
+                                                        peer,
+                                                        comm->comm(),
+                                                        stream));
     }
 
     VLOG(3) << "rank " << rank << " send " << send_numel << " from offset["
(Diffs for the remaining changed files are not shown here.)
