commit d3534d2b14

@@ -0,0 +1,55 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace framework {
namespace details {

void FusedBroadcastOpHandle::RunImpl() {
  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);

  if (places_.size() == 1UL) return;

  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
  auto out_var_handles = DynamicCast<VarHandle>(outputs_);

  WaitInputVarGenerated();

  std::vector<const Scope *> var_scopes;
  for (auto *s : local_scopes_) {
    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
  }

  size_t place_num = places_.size();
  PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());

  for (size_t i = 0; i < in_var_handles.size(); ++i) {
    BroadcastOneVar(
        *in_var_handles[i],
        std::vector<VarHandle *>(out_var_handles.begin() + i * place_num,
                                 out_var_handles.begin() + (i + 1) * place_num),
        var_scopes);
  }
}

std::string FusedBroadcastOpHandle::Name() const { return "fused_broadcast"; }

}  // namespace details
}  // namespace framework
}  // namespace paddle
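
The loop in RunImpl above relies on out_var_handles being laid out so that the outputs of input i occupy the contiguous slice [i * place_num, (i + 1) * place_num). The following standalone C++ sketch illustrates that partitioning with hypothetical slot names; it is not part of this commit.

// Standalone illustration: partition a flat list of per-place output slots so
// that input i owns the slice [i * place_num, (i + 1) * place_num), mirroring
// the indexing in RunImpl above. Slot names are hypothetical.
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

int main() {
  const size_t place_num = 3;  // e.g. three devices
  // Two input variables, each broadcast to every place -> 6 output slots.
  std::vector<std::string> out_slots = {"a@p0", "a@p1", "a@p2",
                                        "b@p0", "b@p1", "b@p2"};
  const size_t in_num = out_slots.size() / place_num;
  assert(in_num * place_num == out_slots.size());  // same invariant as PADDLE_ENFORCE_EQ

  for (size_t i = 0; i < in_num; ++i) {
    std::vector<std::string> group(out_slots.begin() + i * place_num,
                                   out_slots.begin() + (i + 1) * place_num);
    std::cout << "input " << i << " broadcasts to:";
    for (const auto &s : group) std::cout << ' ' << s;
    std::cout << '\n';
  }
  return 0;
}
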
@@ -0,0 +1,57 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <map>
#include <string>
#include <vector>

#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif

namespace paddle {
namespace framework {
namespace details {

struct FusedBroadcastOpHandle : public BroadcastOpHandle {
 public:
#ifdef PADDLE_WITH_CUDA
  FusedBroadcastOpHandle(ir::Node *node,
                         const std::vector<Scope *> local_scopes,
                         const std::vector<platform::Place> &places,
                         const platform::NCCLContextMap *nccl_ctx)
      : BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {}
#else
  FusedBroadcastOpHandle(ir::Node *node,
                         const std::vector<Scope *> local_scopes,
                         const std::vector<platform::Place> &places)
      : BroadcastOpHandle(node, local_scopes, places) {}
#endif

  std::string Name() const override;

 protected:
  void RunImpl() override;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle
File diff suppressed because it is too large
@@ -0,0 +1,44 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace ir {

// BatchMergePass copies the forward and backward ops several times so that
// several batches run in one step, simulating large-batch training as if
// more than one GPU were available.
// Users specify how many batches to repeat; gradients are merged across
// those repeats, and the optimizer is then applied to the merged gradients.
// This pass is especially useful for large-batch distributed sync training,
// where it simulates an even larger batch size than the available GPUs alone
// would provide.

class BatchMergePass : public Pass {
 public:
  virtual ~BatchMergePass() {}

 protected:
  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
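
As a rough illustration of the behavior the comment above describes, the sketch below (hypothetical values, not part of this commit) merges gradients from several repeated micro-batches and applies a single plain-SGD step to the merged gradient. Whether the pass sums or averages the repeats is an assumption here.

// Standalone sketch of the effect BatchMergePass simulates: run the
// forward/backward computation `repeats` times, merge the gradients, and
// apply the optimizer once to the merged gradient.
#include <iostream>
#include <vector>

int main() {
  const int repeats = 4;   // how many micro-batches to merge per step
  const float lr = 0.1f;   // plain SGD used only for illustration
  float param = 1.0f;

  // Hypothetical per-micro-batch gradients; in the real pass these come from
  // repeated copies of the forward/backward ops.
  std::vector<float> micro_batch_grads = {0.20f, 0.18f, 0.22f, 0.24f};

  float merged_grad = 0.0f;
  for (int r = 0; r < repeats; ++r) {
    merged_grad += micro_batch_grads[r];  // gradients merged across repeats
  }
  merged_grad /= repeats;  // assumption: averaging; the pass may sum instead

  param -= lr * merged_grad;  // single optimizer step on the merged gradient
  std::cout << "updated param: " << param << '\n';
  return 0;
}
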
@@ -0,0 +1,86 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/lars_momentum_op.h"
#include "paddle/fluid/operators/momentum_op.h"

namespace paddle {
namespace operators {

class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Param",
             "(LoDTensor, default LoDTensor<float>) "
             "Input parameter that has to be updated");
    AddInput("Grad",
             "(LoDTensor, default LoDTensor<float>) "
             "Input gradient of the parameter");
    AddInput("Velocity",
             "(LoDTensor, default LoDTensor<float>) "
             "Input velocity (corresponding to the parameter) "
             "that has to be updated");
    AddInput("LearningRate",
             "(LoDTensor, default LoDTensor<float>) "
             "Input learning rate");

    AddOutput("ParamOut",
              "(LoDTensor) This output is the updated parameter. "
              "It shares memory with Input(Param).");
    AddOutput("VelocityOut",
              "(LoDTensor) This output is the updated velocity. "
              "It shares memory with Input(Velocity).");

    AddAttr<float>("mu", "(float) Momentum coefficient");
    AddAttr<float>("lars_coeff", "(float, default 0.001) LARS coefficient.")
        .SetDefault(0.001);
    AddAttr<float>("lars_weight_decay",
                   "(float, default 0.0005) LARS weight decay")
        .SetDefault(0.0005);

    AddComment(R"DOC(
Lars Momentum Optimizer.

This optimizer uses LARS (https://arxiv.org/abs/1708.03888) to update each
weight with a local learning rate:

$$
local\_lr = \eta *
    \frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\
velocity = mu * velocity +
    local\_lr * (grad + \beta * param) \\
param = param - velocity. \\
$$

Note that lars_weight_decay is used here to decay the weights, so you may not
need an L2 regularizer when using LARS.

)DOC");
  }
};

class LarsMomentumOpVarTypeInference : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc &op_desc,
                  framework::BlockDesc *block) const override {}
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(lars_momentum, ops::MomentumOp, ops::LarsMomentumOpMaker,
                  paddle::framework::EmptyGradOpMaker,
                  ops::LarsMomentumOpVarTypeInference);
REGISTER_OP_CPU_KERNEL(lars_momentum, ops::LarsMomentumOpKernel<float>,
                       ops::LarsMomentumOpKernel<double>);
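
A small standalone numeric sketch of the update described in the DOC block above (hypothetical values, not part of this commit), written to mirror the per-element math of MomentumLarsKernel in the .cu file:

// Standalone LARS momentum sketch:
//   local_lr = lr * lars_coeff * ||p|| / (||g|| + lars_weight_decay * ||p||)
//   v_new    = mu * v + local_lr * (g + lars_weight_decay * p)
//   p_new    = p - v_new
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const float lr = 0.1f, mu = 0.9f;
  const float lars_coeff = 0.001f, lars_weight_decay = 0.0005f;

  std::vector<float> p = {1.0f, -2.0f, 0.5f};  // parameter
  std::vector<float> g = {0.3f, 0.1f, -0.2f};  // gradient
  std::vector<float> v = {0.0f, 0.0f, 0.0f};   // velocity

  // L2 norms of the whole parameter and gradient tensors.
  float p_norm = 0.0f, g_norm = 0.0f;
  for (size_t i = 0; i < p.size(); ++i) {
    p_norm += p[i] * p[i];
    g_norm += g[i] * g[i];
  }
  p_norm = std::sqrt(p_norm);
  g_norm = std::sqrt(g_norm);

  // Fall back to the plain learning rate when either norm is zero,
  // as the CUDA kernel does.
  float local_lr = lr;
  if (p_norm > 0.0f && g_norm > 0.0f) {
    local_lr = lr * lars_coeff * p_norm / (g_norm + lars_weight_decay * p_norm);
  }

  for (size_t i = 0; i < p.size(); ++i) {
    v[i] = mu * v[i] + local_lr * (g[i] + lars_weight_decay * p[i]);
    p[i] -= v[i];
    std::cout << "p[" << i << "] = " << p[i]
              << ", v[" << i << "] = " << v[i] << '\n';
  }
  return 0;
}
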
@@ -0,0 +1,94 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/lars_momentum_op.h"

namespace paddle {
namespace operators {

template <typename T>
__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
                                   const T* learning_rate, const T mu,
                                   const int64_t num, const T lars_coeff,
                                   const T lars_weight_decay, const T* p_norm,
                                   const T* g_norm, T* p_out, T* v_out) {
  T lr = learning_rate[0];
  T local_lr = learning_rate[0];
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
       i += blockDim.x * gridDim.x) {
    if (p_norm[0] > 0 && g_norm[0] > 0) {
      local_lr = lr * lars_coeff * p_norm[0] /
                 (g_norm[0] + lars_weight_decay * p_norm[0]);
    }
    T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
    v_out[i] = v_new;
    p_out[i] = p[i] - v_new;
  }
}

template <typename DeviceContext, typename T>
class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
    auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
    auto param = ctx.Input<framework::LoDTensor>("Param");
    auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
    auto grad = ctx.Input<framework::LoDTensor>("Grad");
    auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");

    T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());

    T mu = static_cast<T>(ctx.Attr<float>("mu"));
    T lars_coeff = ctx.Attr<float>("lars_coeff");
    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");

    auto* p = param->data<T>();
    auto* v = velocity->data<T>();
    auto* g = grad->data<T>();
    auto* lr = learning_rate->data<T>();

    int block = 512;
    int grid = (param->numel() + block - 1) / block;

    auto eigen_p = framework::EigenVector<T>::Flatten(*param);
    auto eigen_g = framework::EigenVector<T>::Flatten(*grad);
    // Calculate the parameter and gradient norms using Eigen, then launch
    // the element-wise kernel.
    framework::Tensor p_norm_t, g_norm_t;
    p_norm_t.Resize({1});
    g_norm_t.Resize({1});
    auto* p_norm_data = p_norm_t.mutable_data<T>(ctx.GetPlace());
    auto* g_norm_data = g_norm_t.mutable_data<T>(ctx.GetPlace());
    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);

    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
    ep_norm.device(*place) = eigen_p.square().sum().sqrt();
    eg_norm.device(*place) = eigen_g.square().sum().sqrt();
    MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
        p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
        p_norm_data, g_norm_data, p_out, v_out);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    lars_momentum,
    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>);
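
The kernel above uses a grid-stride loop with grid = (numel + block - 1) / block, so one fixed-size launch covers any element count. The standalone C++ sketch below (not part of this commit) simulates that indexing on the host and checks that every element is visited exactly once:

// Host-side simulation of the grid-stride loop in MomentumLarsKernel:
// thread (b, t) starts at b * block + t and strides by block * grid.
#include <cassert>
#include <iostream>
#include <vector>

int main() {
  const int block = 512;                       // threads per block, as above
  const int num = 1000;                        // hypothetical element count
  const int grid = (num + block - 1) / block;  // same launch-size formula

  std::vector<int> visits(num, 0);
  for (int b = 0; b < grid; ++b) {             // simulate blockIdx.x
    for (int t = 0; t < block; ++t) {          // simulate threadIdx.x
      for (int i = b * block + t; i < num; i += block * grid) {
        ++visits[i];
      }
    }
  }
  for (int v : visits) assert(v == 1);         // every element updated once
  std::cout << "all " << num << " elements covered exactly once\n";
  return 0;
}
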
Some files were not shown because too many files have changed in this diff.