diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 75143b9a1a..afd0b70c29 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -126,6 +126,9 @@ void BroadcastOpHandle::BroadcastOneVar( &VariableVisitor::GetMutableTensor(out_var)); } }); + for (auto &p : places_) { + nccl_ctxs_->DevCtx(p)->Wait(); + } #else PADDLE_THROW("CUDA is not enabled."); #endif diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index d14ed36e28..216fb66c03 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -278,12 +278,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, #else const bool use_cuda) const { #endif - VLOG(3) << "apply all passes"; + VLOG(1) << "apply all passes"; // Create a default one if not finalized by user. CreatePassesFromStrategy(false); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { - VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type(); + VLOG(1) << "BuildStrategy::Apply pass:" << pass->Type(); if (IsMultiDevPass(pass->Type())) { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); @@ -349,11 +349,11 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, continue; } } - VLOG(3) << "Start Apply Pass " << pass->Type(); + VLOG(1) << "Start Apply Pass " << pass->Type(); graph = pass->Apply(graph); - VLOG(3) << "Finish Apply Pass " << pass->Type(); + VLOG(1) << "Finish Apply Pass " << pass->Type(); } - VLOG(3) << "All Passes Applied"; + VLOG(1) << "All Passes Applied"; return graph; } diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 929cb51b84..47409b89bc 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -98,7 +98,7 @@ struct BuildStrategy { // faster. Because fusing broadcast OP equals delaying the execution of all // broadcast Ops, in this case, all nccl streams are used only for reduce // operations for a period of time. - bool fuse_broadcast_ops_{false}; + bool fuse_broadcast_ops_{true}; // replace batch_norm with sync_batch_norm. bool sync_batch_norm_{false}; diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc index 457de41c8f..8355764aa6 100644 --- a/paddle/fluid/framework/ir/pass_builder.cc +++ b/paddle/fluid/framework/ir/pass_builder.cc @@ -21,7 +21,7 @@ namespace framework { namespace ir { std::shared_ptr PassBuilder::AppendPass(const std::string& pass_type) { - VLOG(3) << "Append " << pass_type; + VLOG(1) << "Append " << pass_type; auto pass = ir::PassRegistry::Instance().Get(pass_type); passes_.emplace_back(pass.release()); return passes_.back();