merge develop

wangkuiyi-patch-2
dzhwinter 7 years ago
commit 7a517dc93e

@@ -815,3 +815,8 @@ zeros
.. autofunction:: paddle.fluid.layers.zeros
:noindex:
topk
----
.. autofunction:: paddle.fluid.layers.topk
:noindex:

@@ -0,0 +1 @@
../../v2/dev/contribute_to_paddle_cn.md

@@ -0,0 +1 @@
../../v2/dev/contribute_to_paddle_en.md

@@ -4,6 +4,8 @@
.. toctree::
:maxdepth: 1
contribute_to_paddle_cn.md
write_docs_cn.md
api_doc_std_cn.md
new_op_cn.md
new_op_kernel.md

@@ -4,6 +4,8 @@ Development
.. toctree::
:maxdepth: 1
contribute_to_paddle_en.md
write_docs_en.md
api_doc_std_en.md
new_op_en.md
new_op_kernel.md

@@ -0,0 +1 @@
../../v2/dev/write_docs_cn.rst

@@ -0,0 +1 @@
../../v2/dev/write_docs_en.rst

@@ -6,7 +6,43 @@ Data Reader Interface
DataTypes
=========
.. automodule:: paddle.v2.data_type
.. autofunction:: paddle.v2.data_type.dense_array
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_non_value_slot
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_value_slot
:noindex:
.. autoclass:: paddle.v2.data_type.InputType
:members:
:noindex:

File diff suppressed because it is too large.

@@ -102,7 +102,7 @@ cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(channel_test SRCS channel_test.cc)
# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op

@@ -51,23 +51,23 @@ void FetchOpHandle::RunImpl() {
auto *var = static_cast<VarHandle *>(input);
var->generated_op_->Wait(cpu_ctx);
}
tensors_.resize(inputs_.size());
auto *var = static_cast<VarHandle *>(inputs_[0]);
auto &var_name = var->name_;
auto *var_handle = static_cast<VarHandle *>(inputs_[0]);
auto &var_name = var_handle->name_;
platform::CPUPlace cpu;
auto &scopes = *local_scopes_;
for (size_t i = 0; i < scopes.size(); ++i) {
auto &scope = scopes[i];
auto &t = scope->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(var_name)
->Get<framework::LoDTensor>();
if (platform::is_gpu_place(var->place_)) {
auto *var =
scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
var_name);
auto &t = var->Get<framework::LoDTensor>();
if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
dev_ctxes_[t.place()]->Wait();
dev_ctxes_.at(t.place())->Wait();
#endif
} else {
tensors_[i].ShareDataWith(t);
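
Note on the fetch_op_handle change above: the fetched tensor is no longer read straight from the device scope; it is resolved through the per-iteration execution scope published under kLocalExecScopeName, and the lookup is validated before dereferencing. Below is a minimal, self-contained sketch of that two-level lookup pattern; the Scope/Variable types and the key string are toy stand-ins, not the framework's real classes.

#include <cassert>
#include <cstdio>
#include <string>
#include <unordered_map>

// Toy stand-ins for framework::Scope / Variable; illustrative only.
struct Scope;
struct Variable {
  Scope *scope_ptr = nullptr;  // models Var()->GetMutable<Scope *>()
  float value = 0.f;           // models the fetched LoDTensor payload
};

struct Scope {
  Variable *Var(const std::string &name) { return &vars_[name]; }
  Variable *FindVar(const std::string &name) {
    auto it = vars_.find(name);
    return it == vars_.end() ? nullptr : &it->second;
  }
  std::unordered_map<std::string, Variable> vars_;
};

int main() {
  const char *kLocalExecScopeName = "@LOCAL_EXEC_SCOPE@";  // assumed key name
  Scope device_scope, local_exec_scope;

  // The executor publishes the per-iteration scope under a well-known key.
  device_scope.Var(kLocalExecScopeName)->scope_ptr = &local_exec_scope;
  local_exec_scope.Var("loss@GRAD")->value = 1.0f;

  // Fetch: device scope -> local execution scope -> variable, with a null
  // check before use (what PADDLE_ENFORCE_NOT_NULL guards in the diff above).
  Scope *exec = device_scope.FindVar(kLocalExecScopeName)->scope_ptr;
  Variable *fetched = exec->FindVar("loss@GRAD");
  assert(fetched != nullptr && "Cannot find variable in execution scope");
  std::printf("fetched value: %f\n", fetched->value);
  return 0;
}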

@@ -89,101 +89,25 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) {
bool change_forward = false;
if (!is_forwarding) {
// FIXME(yy): Do not hard code like this
if (op->OutputArgumentNames().size() == 1 &&
op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
continue; // Drop fill 1. for backward coeff;
}
}
// append send op if program is distributed trainer main program.
// always use the first device
if (!is_forwarding && op->Type() == "send") {
auto &p = places_[0];
auto *s = local_scopes_[0];
// FIXME(wuyi): send op always copy from GPU 0
result.ops_.emplace_back(new SendOpHandle(*op, s, p));
// Create inputs for output on original place and no ssa output
// is created for send op.
CreateOpHandleIOs(&result, *op, p, 0);
continue;
}
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto *s = local_scopes_[i];
result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
auto *op_handle = result.ops_.back().get();
CreateOpHandleIOs(&result, *op, p, i);
auto var_names = op->OutputArgumentNames();
if (is_forwarding) {
if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
#else
auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif
op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
communication_dev_ctx);
result.ops_.emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only use device_count as scale
// factor. So it does not depend on any other operators.
// VarHandle *loss = GetVarHandle(loss_var_name, place);
// loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
change_forward = true;
}
}
}
if (change_forward) {
if (op->Type() == "send") {
// append send op if program is distributed trainer main program.
// always use the first device
CreateSendOp(&result, *op);
} else if (IsScaleLossOp(*op)) {
CreateScaleLossGradOp(&result);
is_forwarding = false;
}
if (!is_forwarding) {
auto var_names = op->OutputArgumentNames();
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once. But there are no
// other cases, for example, we need to adjust the gradient according to
// the input when we get the gradient, which is not considered at present.
for (auto &og : var_names) {
if (grad_names_.count(og) != 0 &&
og_has_been_broadcast.count(og) == 0) { // is param grad
// Insert NCCL AllReduce Op
og_has_been_broadcast.insert(og);
#ifdef PADDLE_WITH_CUDA
result.ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result.ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto &vars = result.vars_[i][og];
if (vars.empty()) { // This device has no data. continue.
continue;
}
auto &prev_grad = vars[vars.size() - 1];
op_handle->AddInput(prev_grad.get());
auto var = new VarHandle(vars.size() - 1, i, og, p);
vars.emplace_back(var);
op_handle->AddOutput(var);
} else {
CreateComputationalOps(&result, *op);
if (!is_forwarding) {
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once. But there are no
// other cases, for example, we need to adjust the gradient according to
// the input when we get the gradient, which is not considered at
// present.
for (auto &og : op->OutputArgumentNames()) {
if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
InsertNCCLAllReduceOp(&result, og);
}
#else
PADDLE_ENFORCE("Not implemented");
#endif
}
}
}
@@ -207,7 +131,95 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
}
return std::unique_ptr<SSAGraph>(graph);
} // namespace details
}
void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
SSAGraph *result, const std::string &og) const {
#ifdef PADDLE_WITH_CUDA
result->ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result->ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto &vars = result->vars_[i][og];
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad.get());
auto var = new VarHandle(vars.size() - 1, i, og, p);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
#else
PADDLE_ENFORCE("Not implemented");
#endif
}
bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const {
bool is_pg_once =
grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
if (is_pg_once) {
// Insert NCCL AllReduce Op
og_has_been_broadcast->insert(og);
}
return is_pg_once;
}
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]);
#else
auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif
auto *op_handle =
new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
places_[i], communication_dev_ctx);
result->ops_.emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only use device_count as scale
// factor. So it does not depend on any other operators.
// VarHandle *loss = GetVarHandle(loss_var_name, place);
// loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
i);
}
}
void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
const OpDesc &op) const {
for (size_t scope_idx = 0; scope_idx < places_.size(); ++scope_idx) {
auto p = places_[scope_idx];
auto s = local_scopes_[scope_idx];
result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
CreateOpHandleIOs(result, op, p, scope_idx);
}
}
void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
const OpDesc &op) const {
auto &p = places_[0];
auto *s = local_scopes_[0];
// FIXME(wuyi): send op always copy from GPU 0
result->ops_.emplace_back(new SendOpHandle(op, s, p));
// Create inputs for output on original place and no ssa output
// is created for send op.
CreateOpHandleIOs(result, op, p, 0);
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
// FIXME(yy): Do not hard code like this
return op.OutputArgumentNames().size() == 1 &&
op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
}
} // namespace details
} // namespace framework
} // namespace paddle

@@ -57,6 +57,20 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nccl_ctxs_;
#endif
bool IsScaleLossOp(const OpDesc &op) const;
void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
void CreateComputationalOps(SSAGraph *result, const OpDesc &op) const;
void CreateScaleLossGradOp(SSAGraph *result) const;
bool IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const;
void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
};
} // namespace details
} // namespace framework
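
The Build() refactor above replaces one long loop body with small helpers (CreateSendOp, CreateScaleLossGradOp, CreateComputationalOps, IsParameterGradientOnce, InsertNCCLAllReduceOp), now also declared in the header. A rough, self-contained sketch of the resulting control flow follows; the Op struct, op names, and variable names are hypothetical placeholders standing in for OpDesc/SSAGraph, used only to show how each op is classified.

#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

// Simplified stand-in for OpDesc.
struct Op {
  std::string type;
  std::vector<std::string> outputs;
};

int main() {
  // Hypothetical program: a forward op, the loss-scaling marker, a backward
  // op producing a parameter gradient, and a send op.
  std::vector<Op> ops = {
      {"mul", {"fc_out"}},
      {"fill_constant", {"loss@GRAD"}},  // only output == GradVarName(loss)
      {"mul_grad", {"w@GRAD"}},
      {"send", {"w@GRAD"}},
  };
  const std::unordered_set<std::string> grad_names = {"w@GRAD"};
  std::unordered_set<std::string> og_has_been_broadcast;
  bool is_forwarding = true;

  for (const auto &op : ops) {
    if (op.type == "send") {
      std::printf("CreateSendOp(%s) on device 0\n", op.type.c_str());
    } else if (op.outputs.size() == 1 && op.outputs[0] == "loss@GRAD") {
      // IsScaleLossOp(): the op whose only output is the loss gradient.
      std::printf("CreateScaleLossGradOp() on every device\n");
      is_forwarding = false;
    } else {
      std::printf("CreateComputationalOps(%s) on every device\n",
                  op.type.c_str());
      if (!is_forwarding) {
        for (const auto &og : op.outputs) {
          // IsParameterGradientOnce(): broadcast each parameter gradient once.
          if (grad_names.count(og) && !og_has_been_broadcast.count(og)) {
            og_has_been_broadcast.insert(og);
            std::printf("InsertNCCLAllReduceOp(%s)\n", og.c_str());
          }
        }
      }
    }
  }
  return 0;
}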

@@ -73,8 +73,9 @@ void NCCLAllReduceOpHandle::RunImpl() {
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i];
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
lod_tensors.emplace_back(lod_tensor);
}
@@ -110,17 +111,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
});
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg =
*this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>();
auto &trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->Var()
->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func);
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &scope = local_scopes_[i];
auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope->FindVar(var_name);
auto *var = scope.FindVar(var_name);
auto *dev_ctx = dev_ctxes_[p];
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {

@@ -30,10 +30,11 @@ ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
void ScaleLossGradOpHandle::RunImpl() {
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
float *tmp =
scope_->FindVar(var_name)->GetMutable<LoDTensor>()->mutable_data<float>(
make_ddim({1}), place_);
float *tmp = local_scope.FindVar(var_name)
->GetMutable<LoDTensor>()
->mutable_data<float>(make_ddim({1}), place_);
if (platform::is_cpu_place(place_)) {
*tmp = coeff_;

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <unordered_set>
#include <vector>
@@ -69,8 +70,7 @@ class GradOpDescMakerBase {
" for input argument with a list of variables, "
" drop_empty_grad is not allowed because it makes"
" the correspondence bewteen a variable and its gradient"
" ambiguous. Use REGISTER_OP_EX to register the op"
" or call InputGrad(?,false) in GradOpDescMaker."
" ambiguous."
" Op type %s",
fwd_op_.Type());

@@ -16,6 +16,8 @@ limitations under the License. */
#include <algorithm>
#include <atomic>
#include <string>
#include <tuple>
#include <type_traits>
#include <typeinfo>
#include <unordered_map>
@@ -141,36 +143,6 @@ class OpKernelRegistrar : public Registrar {
return 0; \
}
/**
* Macro to register Operator. When the input is duplicable, you should
* use REGISTER_OP_EX with drop_empty_grad=false instead.
*/
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \
REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class, true)
// When an argument is duplicable, we need to use this version.
// Perhaps we can omit DropEmptyIG template parameter and
// only have one version of REGISTER_OP.
#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class, drop_empty_grad) \
REGISTER_OPERATOR(grad_op_type, grad_op_class); \
class _GradOpDescMaker_##grad_op_type##_ \
: public ::paddle::framework::DefaultGradOpDescMaker<drop_empty_grad> { \
using ::paddle::framework::DefaultGradOpDescMaker< \
drop_empty_grad>::DefaultGradOpDescMaker; \
\
protected: \
virtual std::string GradOpType() const { return #grad_op_type; } \
}; \
REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \
op_maker_class);
#define REGISTER_OP_WITH_KERNEL(op_type, ...) \
REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \
##__VA_ARGS__)
#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
REGISTER_OPERATOR(op_type, op_class, op_maker_class)
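
With REGISTER_OP and REGISTER_OP_EX removed above, operators spell out their forward op, maker, gradient-op-desc maker, and gradient op explicitly via REGISTER_OPERATOR, which is what the bilinear_tensor_product and clip conversions later in this diff do. A rough illustration of the migration pattern, using the clip operator as it appears below; this snippet only makes sense inside the Paddle source tree and is not standalone code.

// Before: one macro registered op, maker, and grad op, and synthesized a
// DefaultGradOpDescMaker behind the scenes.
REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
            ops::ClipOpGrad);

// After: each registration is explicit; the grad-op-desc maker is passed
// directly (drop_empty_grad == true matches the old REGISTER_OP behavior).
REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker<float>,
                  paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad)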

@@ -44,6 +44,7 @@ class ParallelExecutorPrivate {
#endif
std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
bool own_local_scope;
};
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -63,13 +64,16 @@ ParallelExecutor::ParallelExecutor(
// Step 1. Bcast the params to devs.
// Create local scopes
if (local_scopes.empty()) {
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(&scope->NewScope());
member_->own_local_scope = true;
member_->local_scopes_.emplace_back(member_->global_scope_);
for (size_t i = 1; i < member_->places_.size(); ++i) {
member_->local_scopes_.emplace_back(&scope->NewScope());
}
} else {
member_->own_local_scope = false;
PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(local_scopes[i]);
member_->local_scopes_.emplace_back(local_scopes[i]);
}
}
@@ -159,7 +163,9 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name) {
platform::RecordBlock b(0);
// Create local scopes.
for (auto &scope : member_->local_scopes_) {
for (auto it = member_->local_scopes_.rbegin();
it != member_->local_scopes_.rend(); ++it) {
auto &scope = *it;
Scope &local_scope = scope->NewScope();
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
&local_scope;
@@ -173,7 +179,7 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
} else {
InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
}
}
@@ -228,5 +234,13 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
}
ParallelExecutor::~ParallelExecutor() {
if (member_->own_local_scope) {
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
}
}
}
} // namespace framework
} // namespace paddle
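
The ParallelExecutor changes above make scope ownership explicit: when the caller passes no local scopes, device 0 reuses the global scope, the remaining scopes are created by the executor (own_local_scope = true), and only those self-created scopes are deleted in the destructor. A small self-contained model of that ownership rule, with a toy Scope and a placeholder Executor class in place of the real framework types:

#include <cstdio>
#include <memory>
#include <vector>

// Toy scope tree; only NewScope/DeleteScope matter for this sketch.
struct Scope {
  Scope *NewScope() {
    children_.emplace_back(new Scope);
    return children_.back().get();
  }
  void DeleteScope(Scope *s) {
    for (auto it = children_.begin(); it != children_.end(); ++it) {
      if (it->get() == s) { children_.erase(it); return; }
    }
  }
  std::vector<std::unique_ptr<Scope>> children_;
};

class Executor {
 public:
  Executor(Scope *global, std::vector<Scope *> provided, int num_places)
      : global_(global) {
    if (provided.empty()) {
      own_local_scope_ = true;
      local_scopes_.push_back(global_);  // device 0 reuses the global scope
      for (int i = 1; i < num_places; ++i) {
        local_scopes_.push_back(global_->NewScope());
      }
    } else {
      own_local_scope_ = false;
      local_scopes_ = provided;          // caller keeps ownership
    }
  }
  ~Executor() {
    if (own_local_scope_) {
      // Skip index 0: that is the caller's global scope, not ours to delete.
      for (size_t i = 1; i < local_scopes_.size(); ++i) {
        global_->DeleteScope(local_scopes_[i]);
      }
    }
  }

 private:
  Scope *global_;
  std::vector<Scope *> local_scopes_;
  bool own_local_scope_ = false;
};

int main() {
  Scope global;
  { Executor exe(&global, {}, 4); }  // creates 3 child scopes, deletes them on exit
  std::printf("children left: %zu\n", global.children_.size());  // prints 0
  return 0;
}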

@@ -42,6 +42,8 @@ class ParallelExecutor {
const std::vector<Scope*>& local_scopes,
bool allow_op_delay);
~ParallelExecutor();
std::vector<Scope*>& GetLocalScopes();
/**

@@ -110,12 +110,12 @@ function(op_library TARGET)
# Note that it's enough to just add one operator to pybind in a *_op.cc file.
# And for detail pybind information, please see generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
endif()

@@ -469,8 +469,6 @@ REGISTER_ACTIVATION_OP_GRAD_MAKER(HardSigmoid, hard_sigmoid);
namespace ops = paddle::operators;
void DummyFunctor() {}
#define FOR_EACH_INPLACE_OP_FUNCTOR(__macro) \
__macro(Sigmoid, sigmoid); \
__macro(Relu, relu); \

@@ -648,7 +648,7 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
bool Inplace() const { return IsInplace("softrelu"); }
bool Inplace() const { return IsInplace("soft_relu"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {

@@ -153,9 +153,11 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
ops::BilinearTensorProductOpGrad);
REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp,
ops::BilinearTensorProductOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(bilinear_tensor_product_grad,
ops::BilinearTensorProductOpGrad)
REGISTER_OP_CPU_KERNEL(
bilinear_tensor_product,
ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext, float>,

@@ -81,8 +81,9 @@ class ClipOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
ops::ClipOpGrad);
REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad)
REGISTER_OP_CPU_KERNEL(
clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(

Some files were not shown because too many files have changed in this diff.
