Merge pull request #10017 from reyoung/feature/clean_pe

Clean MultiDevicesGraphBuilder
wangkuiyi-patch-2
Yu Yang 7 years ago committed by GitHub
commit 6c7daaf025
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -89,51 +89,99 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
bool is_forwarding = true; bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) { for (auto *op : program.Block(0).AllOps()) {
bool change_forward = false; if (op->Type() == "send") {
// append send op if program is distributed trainer main program.
// always use the first device
CreateSendOp(&result, *op);
} else if (IsScaleLossOp(*op)) {
CreateScaleLossGradOp(&result);
is_forwarding = false;
} else {
CreateComputationalOps(&result, *op);
if (!is_forwarding) { if (!is_forwarding) {
// FIXME(yy): Do not hard code like this // Currently, we assume that once a gradient is generated, it can be
if (op->OutputArgumentNames().size() == 1 && // broadcast, and each gradient is only broadcast once. Other cases, for
op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) { // example adjusting the gradient according to the input when the
continue; // Drop fill 1. for backward coeff; // gradient is obtained, are not considered at
// present.
for (auto &og : op->OutputArgumentNames()) {
if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
InsertNCCLAllReduceOp(&result, og);
}
}
}
} }
} }
// append send op if program is distributed trainer main program. /*
// always use the first device Dependency graph has been constructed. However, there are still data
if (!is_forwarding && op->Type() == "send") { hazards that need to be handled.
auto &p = places_[0]; */
auto *s = local_scopes_[0]; PolishGraphToSupportDataHazards(&result);
// FIXME(wuyi): send op always copy from GPU 0
result.ops_.emplace_back(new SendOpHandle(*op, s, p)); /*
// Create inputs for output on original place and no ssa output * Only variables should be the leaves of graph.
// is created for send op. */
CreateOpHandleIOs(&result, *op, p, 0); AddOutputToLeafOps(&result);
continue;
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
PrintGraphviz(*graph, sout);
VLOG(10) << sout.str();
} }
return std::unique_ptr<SSAGraph>(graph);
}
void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
SSAGraph *result, const std::string &og) const {
#ifdef PADDLE_WITH_CUDA
result->ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result->ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i]; auto &p = places_[i];
auto *s = local_scopes_[i]; auto &vars = result->vars_[i][og];
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad.get());
result.ops_.emplace_back(new ComputationOpHandle(*op, s, p)); auto var = new VarHandle(vars.size() - 1, i, og, p);
auto *op_handle = result.ops_.back().get(); vars.emplace_back(var);
CreateOpHandleIOs(&result, *op, p, i); op_handle->AddOutput(var);
}
#else
PADDLE_ENFORCE("Not implemented");
#endif
}
auto var_names = op->OutputArgumentNames(); bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const {
bool is_pg_once =
grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
if (is_pg_once) {
// Insert NCCL AllReduce Op
og_has_been_broadcast->insert(og);
}
return is_pg_once;
}
if (is_forwarding) { void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
if (var_names.size() == 1 && var_names[0] == loss_var_name_) { for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle // Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p); auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]);
#else #else
auto *communication_dev_ctx = auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif #endif
op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p, auto *op_handle =
communication_dev_ctx); new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
result.ops_.emplace_back(op_handle); places_[i], communication_dev_ctx);
result->ops_.emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only use device_count as scale // FIXME: Currently ScaleLossGradOp only use device_count as scale
// factor. So it does not depend on any other operators. // factor. So it does not depend on any other operators.
@ -141,73 +189,37 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
// loss->pending_ops_.emplace_back(op_handle); // loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss); // op_handle->inputs_.emplace_back(loss);
CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i); CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
change_forward = true; i);
}
} }
} }
if (change_forward) { void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
is_forwarding = false; const OpDesc &op) const {
for (size_t scope_idx = 0; scope_idx < places_.size(); ++scope_idx) {
auto p = places_[scope_idx];
auto s = local_scopes_[scope_idx];
result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
CreateOpHandleIOs(result, op, p, scope_idx);
} }
if (!is_forwarding) {
auto var_names = op->OutputArgumentNames();
// Currently, we assume that once a gradient is generated, it can be
// broadcast, and each gradient is only broadcast once. Other cases, for
// example adjusting the gradient according to the input when the gradient
// is obtained, are not considered at present.
for (auto &og : var_names) {
if (grad_names_.count(og) != 0 &&
og_has_been_broadcast.count(og) == 0) { // is param grad
// Insert NCCL AllReduce Op
og_has_been_broadcast.insert(og);
#ifdef PADDLE_WITH_CUDA
result.ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result.ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto &vars = result.vars_[i][og];
if (vars.empty()) { // This device has no data. continue.
continue;
} }
auto &prev_grad = vars[vars.size() - 1];
op_handle->AddInput(prev_grad.get());
auto var = new VarHandle(vars.size() - 1, i, og, p); void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
vars.emplace_back(var); const OpDesc &op) const {
op_handle->AddOutput(var); auto &p = places_[0];
} auto *s = local_scopes_[0];
#else // FIXME(wuyi): send op always copy from GPU 0
PADDLE_ENFORCE("Not implemented"); result->ops_.emplace_back(new SendOpHandle(op, s, p));
#endif // Create inputs for output on original place and no ssa output
} // is created for send op.
} CreateOpHandleIOs(result, op, p, 0);
}
} }
/* bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
Dependency graph has been constructed. However, there are still data // FIXME(yy): Do not hard code like this
hazards that need to be handled. return op.OutputArgumentNames().size() == 1 &&
*/ op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
PolishGraphToSupportDataHazards(&result);
/*
* Only variables should be the leaves of graph.
*/
AddOutputToLeafOps(&result);
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
PrintGraphviz(*graph, sout);
VLOG(10) << sout.str();
} }
return std::unique_ptr<SSAGraph>(graph);
} // namespace details
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle

@ -57,6 +57,20 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nccl_ctxs_; platform::NCCLContextMap *nccl_ctxs_;
#endif #endif
// Returns true when `op` has exactly one output argument name and that name
// equals GradVarName(loss_var_name_).
bool IsScaleLossOp(const OpDesc &op) const;
// Appends a SendOpHandle for `op` to result->ops_, built from places_[0] and
// local_scopes_[0], then calls CreateOpHandleIOs for that place with index 0.
void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
// For each index into places_, appends a ComputationOpHandle for `op` (built
// with the place and the local scope at that index) to result->ops_ and calls
// CreateOpHandleIOs for it.
void CreateComputationalOps(SSAGraph *result, const OpDesc &op) const;
// For each index into places_, appends a ScaleLossGradOpHandle (constructed
// with local_scopes_.size(), the scope and place at that index, and a device
// context) to result->ops_, then creates its output for
// GradVarName(loss_var_name_) via CreateOpOutput.
void CreateScaleLossGradOp(SSAGraph *result) const;
// Returns true when `og` is present in grad_names_ and is not yet present in
// *og_has_been_broadcast. In that case `og` is also inserted into
// *og_has_been_broadcast, so a later call with the same name returns false.
bool IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const;
// When compiled with PADDLE_WITH_CUDA: appends an NCCLAllReduceOpHandle to
// result->ops_. For each index into places_, it enforces that
// result->vars_[i][og] is non-empty, adds the last VarHandle there as an input
// of the new op, and appends a new VarHandle for `og` that becomes an output.
// Without PADDLE_WITH_CUDA the body is only PADDLE_ENFORCE("Not implemented").
// NOTE(review): that enforce is given a string literal as its condition;
// presumably it never fires -- confirm against the PADDLE_ENFORCE definition.
void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework

Loading…
Cancel
Save