Support dynamic graph distributed (#28997)
* add reducer * refine event for memory copy * add concat&split for allreduce * apply concat & split for fuse tensor * fix nccl dep * fix the unittest, compile problem and ddp initialize problem * fix unittest for mac & add some comments & solve the repeated param in sublayers * fix unittest for windows & fix document
parent
7e5e9934fe
commit
e2d01eb650
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,225 @@
|
||||
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "paddle/fluid/framework/data_type.h"
|
||||
#include "paddle/fluid/imperative/layer.h"
|
||||
#include "paddle/fluid/imperative/variable_wrapper.h"
|
||||
#include "paddle/fluid/memory/memory.h"
|
||||
|
||||
#if defined(PADDLE_WITH_NCCL)
|
||||
#include "paddle/fluid/imperative/all_reduce.h"
|
||||
#include "paddle/fluid/operators/math/concat_and_split.h"
|
||||
#include "paddle/fluid/operators/strided_memcpy.h"
|
||||
#include "paddle/fluid/platform/cuda_resource_pool.h"
|
||||
#endif
|
||||
|
||||
namespace paddle {
|
||||
namespace imperative {
|
||||
|
||||
#if defined(PADDLE_WITH_NCCL)
|
||||
template <typename T>
|
||||
void ConcatTensorsForAllReduce(
|
||||
const platform::CUDADeviceContext& context,
|
||||
const std::vector<framework::Tensor>& dense_tensors_,
|
||||
framework::Variable* p_dense_contents) {
|
||||
operators::math::ConcatFunctor<platform::CUDADeviceContext, T>
|
||||
concat_functor_;
|
||||
concat_functor_(context, dense_tensors_, 0,
|
||||
p_dense_contents->GetMutable<framework::LoDTensor>());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void SplitTensorsForAllReduce(const platform::CUDADeviceContext& context,
|
||||
framework::Variable* p_dense_contents,
|
||||
std::vector<framework::Tensor>* p_dense_tensors) {
|
||||
auto* in = p_dense_contents->GetMutable<framework::LoDTensor>();
|
||||
std::vector<framework::Tensor*> outs;
|
||||
std::vector<const framework::Tensor*> shape_refer;
|
||||
|
||||
outs.reserve(p_dense_tensors->size());
|
||||
shape_refer.reserve(p_dense_tensors->size());
|
||||
|
||||
for (auto& tensor : *p_dense_tensors) {
|
||||
outs.emplace_back(&tensor);
|
||||
shape_refer.emplace_back(&tensor);
|
||||
}
|
||||
// Sometimes direct copies will be faster
|
||||
if (p_dense_tensors->size() < 10) {
|
||||
operators::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs);
|
||||
} else {
|
||||
operators::math::SplitFunctor<platform::CUDADeviceContext, T>
|
||||
split_functor_;
|
||||
split_functor_(context, *in, shape_refer, 0, &outs);
|
||||
}
|
||||
}
|
||||
|
||||
// A Group bundles several gradient variables so their gradients can be
// fused into one buffer and reduced with a single collective call.
class Group {
 public:
  // dense_contents_ and sparse_contents_ implement the tensor fusion.
  // When is_sparse_ is true only sparse_contents_ is used; otherwise only
  // dense_contents_ is used -- the two are mutually exclusive.
  framework::Variable dense_contents_;
  framework::Variable* sparse_contents_ = nullptr;
  bool is_sparse_ = false;

  // Per-variable tensors handed to the concat/split kernels.
  std::vector<framework::Tensor> dense_tensors_;

  // NOTE(review): presumably the element count of each variable in the
  // fused buffer; not read in this header -- confirm against the .cc.
  std::vector<size_t> length_;
  // Global indices of participating variables in the group.
  std::vector<size_t> variable_indices_;

  // Number of params that haven't been marked ready. When it reaches 0
  // the group is ready. Initialized to -1 (wraps to SIZE_MAX for size_t)
  // until the group is set up.
  size_t pending_ = -1;

  // Common data type of the group's tensors; drives the dispatch in
  // ConcatTensors/SplitTensors below.
  framework::proto::VarType::Type dtype_;

  // Concatenate dense_tensors_ into dense_contents_. `context` selects
  // the stream the concat kernel runs on.
  void ConcatTensors(const platform::CUDADeviceContext& context) {
    switch (dtype_) {
      case framework::proto::VarType::FP16:
        ConcatTensorsForAllReduce<platform::float16>(context, dense_tensors_,
                                                     &dense_contents_);
        break;
      case framework::proto::VarType::FP32:
        ConcatTensorsForAllReduce<float>(context, dense_tensors_,
                                         &dense_contents_);
        break;
      case framework::proto::VarType::FP64:
        ConcatTensorsForAllReduce<double>(context, dense_tensors_,
                                          &dense_contents_);
        break;
      default:
        PADDLE_THROW(platform::errors::Unimplemented(
            "Data type (%s) is not supported when it concats tensors for "
            "allreduce.",
            framework::DataTypeToString(dtype_)));
    }
  }

  // Split dense_contents_ back into dense_tensors_. `context` selects
  // the stream the split kernel runs on.
  void SplitTensors(const platform::CUDADeviceContext& context) {
    switch (dtype_) {
      case framework::proto::VarType::FP16:
        SplitTensorsForAllReduce<platform::float16>(context, &dense_contents_,
                                                    &dense_tensors_);
        break;
      case framework::proto::VarType::FP32:
        SplitTensorsForAllReduce<float>(context, &dense_contents_,
                                        &dense_tensors_);
        break;
      case framework::proto::VarType::FP64:
        SplitTensorsForAllReduce<double>(context, &dense_contents_,
                                         &dense_tensors_);
        break;
      default:
        PADDLE_THROW(platform::errors::Unimplemented(
            "Data type (%s) is not supported when it splits tensors for "
            "allreduce.",
            framework::DataTypeToString(dtype_)));
    }
  }
};
|
||||
|
||||
// Addresses a variable inside the Reducer's two-level group structure.
struct VariableIndex {
  // Index of the owning group in groups_.
  size_t group_index;
  // Position of the variable inside that group.
  size_t inside_group_index;
};
|
||||
|
||||
class Reducer {
|
||||
public:
|
||||
explicit Reducer(
|
||||
const std::vector<std::shared_ptr<imperative::VarBase>>& vars,
|
||||
const std::vector<std::vector<size_t>>& group_indices,
|
||||
const std::vector<bool>& is_sparse_gradient,
|
||||
std::shared_ptr<imperative::ParallelContext> parallel_ctx);
|
||||
|
||||
virtual ~Reducer() {}
|
||||
|
||||
void InitializeGroups(const std::vector<std::vector<size_t>>& group_indices);
|
||||
|
||||
int64_t InitializeDenseGroups(const std::vector<size_t>& variable_indices_,
|
||||
Group* p_group);
|
||||
|
||||
void PrepareForBackward();
|
||||
|
||||
void AddDistHook(VariableWrapper* var_warpper,
|
||||
const VariableIndex& var_index);
|
||||
|
||||
void MarkVariableReady(const VariableIndex& var_index,
|
||||
VariableWrapper* var_warpper);
|
||||
|
||||
void MarkGroupReady(size_t group_index);
|
||||
|
||||
void FinalizeBackward();
|
||||
|
||||
void ReleaseReducer();
|
||||
|
||||
// Reducer Singleton
|
||||
static std::shared_ptr<Reducer> SetInstance(
|
||||
const std::vector<std::shared_ptr<imperative::VarBase>>& vars,
|
||||
const std::vector<std::vector<size_t>>& group_indices,
|
||||
const std::vector<bool>& is_sparse_gradient,
|
||||
std::shared_ptr<imperative::ParallelContext> parallel_ctx) {
|
||||
if (NULL == s_instance_) {
|
||||
s_instance_.reset(new paddle::imperative::Reducer(
|
||||
vars, group_indices, is_sparse_gradient, parallel_ctx));
|
||||
}
|
||||
return s_instance_;
|
||||
}
|
||||
|
||||
static std::shared_ptr<Reducer> GetInstance() {
|
||||
PADDLE_ENFORCE_EQ(
|
||||
s_instance_ != NULL, true,
|
||||
platform::errors::InvalidArgument("Reducer is not initialized."));
|
||||
return s_instance_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<std::shared_ptr<imperative::VarBase>> vars_;
|
||||
std::vector<std::vector<size_t>> group_indices_;
|
||||
static std::shared_ptr<Reducer> s_instance_;
|
||||
std::vector<Group> groups_;
|
||||
size_t next_group_ = 0;
|
||||
platform::Place place_;
|
||||
std::once_flag once_flag_;
|
||||
std::vector<bool> is_sparse_gradient_;
|
||||
std::shared_ptr<imperative::ParallelContext> parallel_ctx_;
|
||||
|
||||
std::vector<std::shared_ptr<platform::CudaEventObject>> events_;
|
||||
std::shared_ptr<platform::CudaEventObject> comm_enent_;
|
||||
cudaStream_t compute_stream_;
|
||||
cudaStream_t comm_stream_;
|
||||
};
|
||||
|
||||
// Partition `tensors` into groups for fused allreduce. Variables of the
// same dtype/tensor type are bucketed together, a bucket is closed once
// it reaches the current entry of `group_size_limits`, and variables
// flagged in `is_sparse_gradient` are kept in groups of their own.
// Returns the global variable indices of each group. Defined in the .cc.
std::vector<std::vector<size_t>> AssignGroupBySize(
    const std::vector<std::shared_ptr<imperative::VarBase>>& tensors,
    const std::vector<bool>& is_sparse_gradient,
    const std::vector<size_t>& group_size_limits);
|
||||
#endif
|
||||
|
||||
} // namespace imperative
|
||||
} // namespace paddle
|
@ -0,0 +1,56 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.dygraph.nn import Embedding
|
||||
from paddle.fluid.dygraph.base import to_variable
|
||||
|
||||
from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
|
||||
from parallel_dygraph_sparse_embedding import SimpleNet, fake_sample_reader, TestSparseEmbedding
|
||||
|
||||
# global configs
|
||||
batch_size = 4
|
||||
batch_num = 200
|
||||
hidden_size = 10
|
||||
vocab_size = 1000
|
||||
num_steps = 3
|
||||
init_scale = 0.1
|
||||
|
||||
|
||||
class TestSparseEmbeddingFP64(TestSparseEmbedding):
    """Variant of TestSparseEmbedding that runs the model in float64."""

    def get_model(self):
        # Same network as the base test, with dtype switched to fp64.
        net = SimpleNet(
            hidden_size=hidden_size,
            vocab_size=vocab_size,
            num_steps=num_steps,
            init_scale=init_scale,
            is_sparse=True,
            dtype="float64")

        reader = paddle.batch(
            fake_sample_reader(), batch_size=batch_size, drop_last=True)

        sgd = fluid.optimizer.SGD(learning_rate=0.001,
                                  parameter_list=net.parameters())

        return net, reader, sgd
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # runtime_main (from test_dist_base) drives this model as one worker
    # of the distributed test harness.
    runtime_main(TestSparseEmbeddingFP64)
|
@ -0,0 +1,160 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import contextlib
|
||||
import unittest
|
||||
import numpy as np
|
||||
import six
|
||||
import unittest
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.dygraph as dygraph
|
||||
from paddle.fluid.dygraph.nn import Linear
|
||||
import paddle.fluid.core as core
|
||||
from paddle.fluid.optimizer import SGDOptimizer
|
||||
|
||||
|
||||
class MLP(fluid.Layer):
    """A minimal two-layer perceptron (784 -> 10 -> 10)."""

    def __init__(self, param_attr=None, bias_attr=None):
        super(MLP, self).__init__()

        # Sublayer names are part of the parameter state, so keep them.
        self._linear1 = Linear(784, 10)
        self._linear2 = Linear(10, 10)

    def forward(self, inputs):
        hidden = self._linear1(inputs)
        return self._linear2(hidden)
|
||||
|
||||
|
||||
class TestDataParallelGroup(unittest.TestCase):
    """Checks the bucketing produced by core.assign_group_by_size."""

    def create_varbase(self, dtype, shape,
                       type=core.VarDesc.VarType.LOD_TENSOR):
        # Empty name and persistable=True; only dtype/shape/type matter here.
        return core.VarBase(dtype, shape, "", type, True)

    def test_construct_group0(self):
        # one dtype & one limit capability
        fp32 = core.VarDesc.VarType.FP32
        shapes = [[2, 50], [2, 100], [2, 50], [2, 25]]
        var_list = [self.create_varbase(fp32, shape) for shape in shapes]
        res = core.assign_group_by_size(var_list, [False] * 4, [400])
        self.assertEqual([[0], [1], [2], [3]], res)

    def test_construct_group1(self):
        # multi dtype & one limit capability
        fp32 = core.VarDesc.VarType.FP32
        fp64 = core.VarDesc.VarType.FP64
        var_list = []
        for _ in range(3):
            var_list.append(self.create_varbase(fp32, [1, 50]))
            var_list.append(self.create_varbase(fp64, [1, 25]))
        res = core.assign_group_by_size(var_list, [False] * 6, [400])
        self.assertEqual([[0, 2], [1, 3], [4], [5]], res)

    def test_construct_group2(self):
        # one dtype & multi limit capability
        fp32 = core.VarDesc.VarType.FP32
        var_list = [self.create_varbase(fp32, [2, 50]) for _ in range(4)]
        res = core.assign_group_by_size(var_list, [False] * 4, [400, 800])
        self.assertEqual([[0], [1, 2], [3]], res)

    def test_construct_group3(self):
        # multi dtype & multi limit capability
        fp32 = core.VarDesc.VarType.FP32
        fp64 = core.VarDesc.VarType.FP64
        var_list = []
        for _ in range(3):
            var_list.append(self.create_varbase(fp32, [1, 50]))
            var_list.append(self.create_varbase(fp64, [1, 25]))
        res = core.assign_group_by_size(var_list, [False] * 6, [200, 400])
        self.assertEqual([[0], [1], [2, 4], [3, 5]], res)

    def test_construct_group4(self):
        # multi dtype & zero limit capability
        fp32 = core.VarDesc.VarType.FP32
        fp64 = core.VarDesc.VarType.FP64
        var_list = []
        for _ in range(3):
            var_list.append(self.create_varbase(fp32, [1, 50]))
            var_list.append(self.create_varbase(fp64, [1, 25]))
        res = core.assign_group_by_size(var_list, [False] * 6, [0])
        self.assertEqual([[0], [1], [2], [3], [4], [5]], res)

    def test_construct_group5(self):
        # multi dtype & infinite capability
        fp32 = core.VarDesc.VarType.FP32
        fp64 = core.VarDesc.VarType.FP64
        var_list = []
        for _ in range(3):
            var_list.append(self.create_varbase(fp32, [1, 50]))
            var_list.append(self.create_varbase(fp64, [1, 25]))
        res = core.assign_group_by_size(var_list, [False] * 6, [10000])
        self.assertEqual([[0, 2, 4], [1, 3, 5]], res)

    def test_construct_group6(self):
        # multi dtype & limit capability & multi tensor type
        fp32 = core.VarDesc.VarType.FP32
        fp64 = core.VarDesc.VarType.FP64
        rows = core.VarDesc.VarType.SELECTED_ROWS
        var_list = [
            self.create_varbase(fp32, [1, 50], rows),
            self.create_varbase(fp64, [1, 25]),
            self.create_varbase(fp32, [1, 50]),
            self.create_varbase(fp64, [1, 25]),
            self.create_varbase(fp32, [1, 50]),
            self.create_varbase(fp64, [1, 25], rows),
        ]
        res = core.assign_group_by_size(
            var_list, [True, False, False, False, False, True], [400])
        self.assertEqual([[0], [1, 3], [2, 4], [5]], res)

    def test_construct_group7(self):
        # multi dtype & multi limit capability & multi tensor type
        fp32 = core.VarDesc.VarType.FP32
        fp64 = core.VarDesc.VarType.FP64
        rows = core.VarDesc.VarType.SELECTED_ROWS
        var_list = [
            self.create_varbase(fp32, [1, 50], rows),
            self.create_varbase(fp64, [1, 25]),
            self.create_varbase(fp32, [1, 50]),
            self.create_varbase(fp64, [1, 25]),
            self.create_varbase(fp32, [1, 50]),
            self.create_varbase(fp64, [1, 25], rows),
        ]
        res = core.assign_group_by_size(
            var_list, [True, False, False, False, False, True], [200, 400])
        self.assertEqual([[0], [1], [2], [3], [4], [5]], res)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Run the grouping unit tests above.
    unittest.main()
|
Loading…
Reference in new issue