@@ -105,38 +105,36 @@ static size_t GetUniqueDeviceIdOfOp(const details::OpHandleBase &op) {
   return dev_idx;
 }
 
-/**
- * This function tries to separate the original graph into multiple graphs, in
- * which each graph would only run on single device. This is usually used to
- * separate a data-parallel inference graph to multiple graphs on each device.
- *
- * The graph can be separated into multiple single device graphs if and only if:
- *
- * - the graph does not contain any ops related to multi-devices communication,
- *   such as allreduce, send, recv, sync_batch_norm, etc.
- *
- * - ops on different devices do not depend on each other. That is to say, the
- *   graph has several disconnected sub-graphs.
- */
-std::vector<std::unique_ptr<ir::Graph>> TrySeparateToMultipleSingleDeviceGraphs(
-    ir::Graph *graph) {
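+// Shared implementation of IsDataParallelInferenceGraph and
+// TrySeparateToMultipleSingleDeviceGraphs: returns true iff the graph can be
+// split into per-device sub-graphs, filling *p_op_to_dev_idx and *p_place_num
+// on success.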
+static bool IsDataParallelInferenceGraphImpl(
+    const ir::Graph &graph,
+    std::unordered_map<details::OpHandleBase *, size_t> *p_op_to_dev_idx,
+    size_t *p_place_num) {
+  auto &place_num = *p_place_num;
+  auto &op_to_dev_idx = *p_op_to_dev_idx;
+
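+  // clear_result resets both output parameters and returns false, so each
+  // failed check below can bail out with a single `return clear_result();`.
+  // It is also invoked once up front so the outputs start from a clean state.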
+  auto clear_result = [&] {
+    place_num = 0;
+    op_to_dev_idx.clear();
+    return false;
+  };
+
+  clear_result();
+
   // If sub-block contains multi-devices ops, we cannot separate
-  if (ContainMultiDeviceOp(graph->OriginProgram(), 1)) {
-    return {};
+  if (ContainMultiDeviceOp(graph.OriginProgram(), 1)) {
+    return clear_result();
   }
 
-  size_t place_num = 0;
-  auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+  auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(graph);
   if (op_handles.empty()) {
-    return {};
+    return clear_result();
   }
 
-  std::unordered_map<details::OpHandleBase *, size_t> op_to_dev_idx;
   for (auto &op : op_handles) {
     auto dev_idx = GetUniqueDeviceIdOfOp(*op);
     if (dev_idx == kUndefinedDevIdx) {
       VLOG(10) << "Op " << op->Name() << " is not determined";
-      return {};
+      return clear_result();
     }
     place_num = std::max(place_num, dev_idx + 1);
     op_to_dev_idx[op] = dev_idx;
@@ -148,7 +146,7 @@ std::vector<std::unique_ptr<ir::Graph>> TrySeparateToMultipleSingleDeviceGraphs(
       if (in_var->GeneratedOp()) {
         auto iter = op_to_dev_idx.find(in_var->GeneratedOp());
         if (iter == op_to_dev_idx.end() || iter->second != dev_idx) {
-          return {};
+          return clear_result();
         }
       }
     }
@@ -157,7 +155,7 @@ std::vector<std::unique_ptr<ir::Graph>> TrySeparateToMultipleSingleDeviceGraphs(
       for (auto &pending_op : out_var->PendingOps()) {
         auto iter = op_to_dev_idx.find(pending_op);
         if (iter == op_to_dev_idx.end() || iter->second != dev_idx) {
-          return {};
+          return clear_result();
         }
       }
     }
@@ -171,6 +169,36 @@ std::vector<std::unique_ptr<ir::Graph>> TrySeparateToMultipleSingleDeviceGraphs(
           "issue at https://github.com/PaddlePaddle/Paddle/issues/new. And "
           "we will resolve it with high priority."));
 
+  return true;
+}
+
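+// Public wrapper for callers that only need the yes/no answer; the op ->
+// device mapping and place count computed by the impl are discarded.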
+bool IsDataParallelInferenceGraph(const ir::Graph &graph) {
+  size_t place_num;
+  std::unordered_map<details::OpHandleBase *, size_t> op_to_dev_idx;
+  return IsDataParallelInferenceGraphImpl(graph, &op_to_dev_idx, &place_num);
+}
+
+/**
+ * This function tries to separate the original graph into multiple graphs,
+ * each of which runs on a single device. This is usually used to split a
+ * data-parallel inference graph into one graph per device.
+ *
+ * The graph can be separated into multiple single-device graphs if and only if:
+ *
+ * - the graph does not contain any ops related to multi-device communication,
+ *   such as allreduce, send, recv, sync_batch_norm, etc.
+ *
+ * - ops on different devices do not depend on each other, i.e. the graph
+ *   consists of several disconnected sub-graphs.
+ */
+std::vector<std::unique_ptr<ir::Graph>> TrySeparateToMultipleSingleDeviceGraphs(
+    ir::Graph *graph) {
+  size_t place_num;
+  std::unordered_map<details::OpHandleBase *, size_t> op_to_dev_idx;
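+  // Reuse the shared check; an empty result tells the caller that the graph
+  // cannot be separated.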
+  if (!IsDataParallelInferenceGraphImpl(*graph, &op_to_dev_idx, &place_num)) {
+    return {};
+  }
+
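+  // A graph that already lives on a single place needs no separation.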
   if (place_num == 1) {
     return {};
   }
@@ -182,8 +210,10 @@ std::vector<std::unique_ptr<ir::Graph>> TrySeparateToMultipleSingleDeviceGraphs(
     g->Set(kGraphDepVars, new GraphDepVars());
   }
 
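+  // op_handles is now local to IsDataParallelInferenceGraphImpl, so iterate
+  // the op -> device mapping it filled in instead.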
-  for (auto &op : op_handles) {
-    auto dev_idx = op_to_dev_idx.at(op);
+  for (auto &pair : op_to_dev_idx) {
+    auto *op = pair.first;
+    auto dev_idx = pair.second;
+
     auto *ret_graph = graphs[dev_idx].get();
     auto &ret_vars = ret_graph->Get<GraphVars>(kGraphVars)[0];
     auto &ret_dummy_vars = ret_graph->Get<GraphDepVars>(kGraphDepVars);