|
|
|
|
@ -73,8 +73,9 @@ void NCCLAllReduceOpHandle::RunImpl() {
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < local_scopes_.size(); ++i) {
|
|
|
|
|
auto *s = local_scopes_[i];
|
|
|
|
|
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
|
|
|
|
|
|
|
|
|
|
auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
|
|
|
|
|
auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
|
|
|
|
|
lod_tensors.emplace_back(lod_tensor);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@ -110,17 +111,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
} else { // Special handle CPU only Operator's gradient. Like CRF
|
|
|
|
|
auto &trg =
|
|
|
|
|
*this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>();
|
|
|
|
|
auto &trg = *this->local_scopes_[0]
|
|
|
|
|
->FindVar(kLocalExecScopeName)
|
|
|
|
|
->Get<Scope *>()
|
|
|
|
|
->Var()
|
|
|
|
|
->GetMutable<framework::LoDTensor>();
|
|
|
|
|
|
|
|
|
|
// Reduce All Tensor to trg in CPU
|
|
|
|
|
ReduceLoDTensor func(lod_tensors, &trg);
|
|
|
|
|
VisitDataType(ToDataType(lod_tensors[0].type()), func);
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < local_scopes_.size(); ++i) {
|
|
|
|
|
auto &scope = local_scopes_[i];
|
|
|
|
|
auto &scope =
|
|
|
|
|
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
|
|
|
|
|
auto &p = places_[i];
|
|
|
|
|
auto *var = scope->FindVar(var_name);
|
|
|
|
|
auto *var = scope.FindVar(var_name);
|
|
|
|
|
auto *dev_ctx = dev_ctxes_[p];
|
|
|
|
|
|
|
|
|
|
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
|
|
|
|
|
|