Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into cpu-for-1.1-merge-with-shape
commit d26ff8cb2d
@@ -0,0 +1,55 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace framework {
namespace details {

void FusedBroadcastOpHandle::RunImpl() {
  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);

  if (places_.size() == 1UL) return;

  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
  auto out_var_handles = DynamicCast<VarHandle>(outputs_);

  WaitInputVarGenerated();

  std::vector<const Scope *> var_scopes;
  for (auto *s : local_scopes_) {
    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
  }

  size_t place_num = places_.size();
  PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());

  // Output handles are grouped per input variable: the handles for input i
  // occupy the contiguous range [i * place_num, (i + 1) * place_num).
  for (size_t i = 0; i < in_var_handles.size(); ++i) {
    BroadcastOneVar(
        *in_var_handles[i],
        std::vector<VarHandle *>(out_var_handles.begin() + i * place_num,
                                 out_var_handles.begin() + (i + 1) * place_num),
        var_scopes);
  }
}

std::string FusedBroadcastOpHandle::Name() const { return "fused_broadcast"; }

}  // namespace details
}  // namespace framework
}  // namespace paddle
@@ -0,0 +1,57 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <map>
#include <string>
#include <vector>

#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif

namespace paddle {
namespace framework {
namespace details {

struct FusedBroadcastOpHandle : public BroadcastOpHandle {
 public:
#ifdef PADDLE_WITH_CUDA
  FusedBroadcastOpHandle(ir::Node *node,
                         const std::vector<Scope *> local_scopes,
                         const std::vector<platform::Place> &places,
                         const platform::NCCLContextMap *nccl_ctx)
      : BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {}
#else
  FusedBroadcastOpHandle(ir::Node *node,
                         const std::vector<Scope *> local_scopes,
                         const std::vector<platform::Place> &places)
      : BroadcastOpHandle(node, local_scopes, places) {}
#endif
  std::string Name() const override;

 protected:
  void RunImpl() override;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle
File diff suppressed because it is too large
@@ -0,0 +1,44 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace ir {

// BatchMergePass copies the forward and backward ops several times so that
// several batches are run per step, simulating large-batch training as if
// more than one GPU were available.
// The user defines how many batches to run; gradients are merged across
// those repeats, and the optimizer is then applied to the merged gradients.
// This pass is especially useful for large-batch distributed sync training,
// since it lets us simulate an even larger batch size than the available
// GPUs would otherwise allow.

class BatchMergePass : public Pass {
 public:
  virtual ~BatchMergePass() {}

 protected:
  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
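The BatchMergePass comment above describes running a configurable number of micro-batches per step and then optimizing on the merged gradients. As a rough illustration of that idea only (not of the pass implementation, which works by cloning ops at the graph level), here is a self-contained C++ sketch; the function name MergedBatchStep, the grad_fn callback, and the choice to average the merged gradient are assumptions made just for this example.

#include <cstddef>
#include <functional>
#include <vector>

// Run `repeats` micro-batches, sum their gradients, and apply a single
// SGD-style update with the merged (here: averaged) gradient. This only
// illustrates the semantics the pass is meant to emulate.
void MergedBatchStep(
    int repeats, float lr, std::vector<float>* param,
    const std::function<std::vector<float>(const std::vector<float>&)>& grad_fn) {
  std::vector<float> merged_grad(param->size(), 0.0f);
  for (int r = 0; r < repeats; ++r) {
    std::vector<float> g = grad_fn(*param);  // gradient of one micro-batch
    for (std::size_t i = 0; i < g.size(); ++i) merged_grad[i] += g[i];
  }
  for (std::size_t i = 0; i < param->size(); ++i) {
    (*param)[i] -= lr * merged_grad[i] / static_cast<float>(repeats);
  }
}

In the real pass the repetition is baked into the graph by copying the forward and backward ops, so no runtime loop like the one above is needed.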
@@ -0,0 +1,86 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/lars_momentum_op.h"
#include "paddle/fluid/operators/momentum_op.h"

namespace paddle {
namespace operators {

class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Param",
             "(LoDTensor, default LoDTensor<float>) "
             "Input parameter that has to be updated");
    AddInput("Grad",
             "(LoDTensor, default LoDTensor<float>) "
             "Input gradient of the parameter");
    AddInput("Velocity",
             "(LoDTensor, default LoDTensor<float>) "
             "Input velocity (corresponding to the parameter) "
             "that has to be updated");
    AddInput("LearningRate",
             "(LoDTensor, default LoDTensor<float>) "
             "Input learning rate");

    AddOutput("ParamOut",
              "(LoDTensor) This output is the updated parameter. "
              "It shares memory with Input(Param).");
    AddOutput("VelocityOut",
              "(LoDTensor) This output is the updated velocity. "
              "It shares memory with Input(Velocity).");

    AddAttr<float>("mu", "(float) Momentum coefficient");
    AddAttr<float>("lars_coeff", "(float, default 0.001) LARS coefficient.")
        .SetDefault(0.001);
    AddAttr<float>("lars_weight_decay",
                   "(float, default 0.0005) LARS weight decay")
        .SetDefault(0.0005);

    AddComment(R"DOC(
Lars Momentum Optimizer.

This optimizer uses LARS (https://arxiv.org/abs/1708.03888) to optimize each
weight with a local learning rate:

$$
local\_lr = \eta *
    \frac{\left \| param \right \|}{\left \| grad \right \| + \beta * \left \| param \right \|} \\
velocity = mu * velocity +
    local\_lr * (grad + \beta * param) \\
param = param - velocity \\
$$

Note that lars_weight_decay is used here to decay the weights, so you may not
need an L2 regularizer when using LARS.

)DOC");
  }
};

class LarsMomentumOpVarTypeInference : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc &op_desc,
                  framework::BlockDesc *block) const override {}
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(lars_momentum, ops::MomentumOp, ops::LarsMomentumOpMaker,
                  paddle::framework::EmptyGradOpMaker,
                  ops::LarsMomentumOpVarTypeInference);
REGISTER_OP_CPU_KERNEL(lars_momentum, ops::LarsMomentumOpKernel<float>,
                       ops::LarsMomentumOpKernel<double>);
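Since the DOC block above states the LARS update only as formulas, the following plain C++ sketch (no Paddle dependencies) spells out the same computation for a single parameter tensor. The function name LarsMomentumStep is invented for this example; it is intended only as a scalar reference against which the CPU and CUDA kernels in this diff can be sanity-checked.

#include <cmath>
#include <cstddef>
#include <vector>

// One LARS momentum step on a single parameter tensor, following the op
// documentation: local_lr = lr * lars_coeff * ||param|| /
// (||grad|| + lars_weight_decay * ||param||), then a momentum update.
void LarsMomentumStep(float lr, float mu, float lars_coeff,
                      float lars_weight_decay,
                      const std::vector<float>& grad,
                      std::vector<float>* param,
                      std::vector<float>* velocity) {
  float p_norm = 0.0f, g_norm = 0.0f;
  for (std::size_t i = 0; i < param->size(); ++i) {
    p_norm += (*param)[i] * (*param)[i];
    g_norm += grad[i] * grad[i];
  }
  p_norm = std::sqrt(p_norm);
  g_norm = std::sqrt(g_norm);

  // Fall back to the global learning rate when either norm is zero,
  // mirroring the kernels below.
  float local_lr = lr;
  if (p_norm > 0.0f && g_norm > 0.0f) {
    local_lr =
        lr * lars_coeff * p_norm / (g_norm + lars_weight_decay * p_norm);
  }
  for (std::size_t i = 0; i < param->size(); ++i) {
    (*velocity)[i] = mu * (*velocity)[i] +
                     local_lr * (grad[i] + lars_weight_decay * (*param)[i]);
    (*param)[i] -= (*velocity)[i];
  }
}

With mu = 0 and lars_weight_decay = 0 this reduces to plain SGD with the learning rate scaled by lars_coeff * ||param|| / ||grad||.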
@@ -0,0 +1,94 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/lars_momentum_op.h"

namespace paddle {
namespace operators {

template <typename T>
__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
                                   const T* learning_rate, const T mu,
                                   const int64_t num, const T lars_coeff,
                                   const T lars_weight_decay, const T* p_norm,
                                   const T* g_norm, T* p_out, T* v_out) {
  T lr = learning_rate[0];
  T local_lr = learning_rate[0];
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
       i += blockDim.x * gridDim.x) {
    if (p_norm[0] > 0 && g_norm[0] > 0) {
      local_lr = lr * lars_coeff * p_norm[0] /
                 (g_norm[0] + lars_weight_decay * p_norm[0]);
    }
    T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
    v_out[i] = v_new;
    p_out[i] = p[i] - v_new;
  }
}

template <typename DeviceContext, typename T>
class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
    auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
    auto param = ctx.Input<framework::LoDTensor>("Param");
    auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
    auto grad = ctx.Input<framework::LoDTensor>("Grad");
    auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");

    T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());

    T mu = static_cast<T>(ctx.Attr<float>("mu"));
    T lars_coeff = ctx.Attr<float>("lars_coeff");
    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");

    auto* p = param->data<T>();
    auto* v = velocity->data<T>();
    auto* g = grad->data<T>();
    auto* lr = learning_rate->data<T>();

    int block = 512;
    int grid = (param->numel() + block - 1) / block;

    auto eigen_p = framework::EigenVector<T>::Flatten(*param);
    auto eigen_g = framework::EigenVector<T>::Flatten(*grad);
    // Calculate the norms using Eigen and then launch the kernel.
    framework::Tensor p_norm_t, g_norm_t;
    p_norm_t.Resize({1});
    g_norm_t.Resize({1});
    auto* p_norm_data = p_norm_t.mutable_data<T>(ctx.GetPlace());
    auto* g_norm_data = g_norm_t.mutable_data<T>(ctx.GetPlace());
    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);

    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
    ep_norm.device(*place) = eigen_p.square().sum().sqrt();
    eg_norm.device(*place) = eigen_g.square().sum().sqrt();
    MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
        p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
        p_norm_data, g_norm_data, p_out, v_out);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    lars_momentum,
    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>);
@@ -0,0 +1,72 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename T>
class LarsMomentumOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
    auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
    auto param = ctx.Input<framework::LoDTensor>("Param");
    auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
    auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
    auto* grad_var = ctx.InputVar("Grad");
    // Only dense (LoDTensor) gradients are supported for now.
    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>());
    auto grad = ctx.Input<framework::LoDTensor>("Grad");

    param_out->mutable_data<T>(ctx.GetPlace());
    velocity_out->mutable_data<T>(ctx.GetPlace());

    T mu = static_cast<T>(ctx.Attr<float>("mu"));
    T lars_coeff = ctx.Attr<float>("lars_coeff");
    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");

    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
    auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);

    auto p = framework::EigenVector<T>::Flatten(*param);
    auto v = framework::EigenVector<T>::Flatten(*velocity);
    auto g = framework::EigenVector<T>::Flatten(*grad);
    auto* lr = learning_rate->data<T>();

    framework::Tensor p_norm_t, g_norm_t;
    p_norm_t.Resize({1});
    g_norm_t.Resize({1});
    p_norm_t.mutable_data<T>(ctx.GetPlace());
    g_norm_t.mutable_data<T>(ctx.GetPlace());
    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);

    ep_norm = p.square().sum().sqrt();
    eg_norm = g.square().sum().sqrt();
    T local_lr = lr[0];
    if (ep_norm(0) > 0 && eg_norm(0) > 0) {
      local_lr = lr[0] * lars_coeff * ep_norm(0) /
                 (eg_norm(0) + lars_weight_decay * ep_norm(0));
    }
    v_out = v * mu + local_lr * (g + lars_weight_decay * p);
    p_out = p - v_out;
  }
};

}  // namespace operators
}  // namespace paddle
Some files were not shown because too many files have changed in this diff.