@@ -22,39 +22,66 @@ namespace details {
 ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
                                              Scope *scope,
                                              platform::Place place,
-                                             platform::DeviceContext *dev_ctx)
+                                             platform::DeviceContext *dev_ctx,
+                                             proto::VarType::Type dtype)
     : OpHandleBase(node),
       coeff_(static_cast<float>(1.0 / num_dev)),
       scope_(scope),
-      place_(place) {
+      place_(place),
+      out_dtype_(dtype) {
   this->SetDeviceContext(place_, dev_ctx);
 }

 ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}

+struct ScaleLossGradFunctor {
+  float coeff_;
+  Tensor *out_;
+  platform::Place place_;
+  OpHandleBase *op_handle_;
+  proto::VarType::Type out_dtype_;
+  platform::DeviceContext *ctx_;
+
+  ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
+                       OpHandleBase *op_handle, proto::VarType::Type dtype,
+                       platform::DeviceContext *ctx)
+      : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
+
+  template <typename OutT>
+  void apply() const {
+    auto *out_data = out_->mutable_data<OutT>(place_);
+    if (platform::is_cpu_place(place_)) {
+      *out_data = static_cast<OutT>(coeff_);
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      OutT cast_coeff = static_cast<OutT>(coeff_);
+      auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(place_), out_data,
+                   platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
+                   stream);
+      VLOG(10) << place_ << "RUN Scale loss grad op";
+
+#endif
+    }
+  }
+};
+
 void ScaleLossGradOpHandle::RunImpl() {
   // Doesn't wait any event
   std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
   auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();

-  float *tmp = local_scope.FindVar(var_name)
-                   ->GetMutable<LoDTensor>()
-                   ->mutable_data<float>(make_ddim({1}), place_);
+  auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
+  tensor->Resize(make_ddim({1}));

-  if (platform::is_cpu_place(place_)) {
-    *tmp = coeff_;
-  } else {
 #ifdef PADDLE_WITH_CUDA
-    this->RunAndRecordEvent([&] {
-      auto stream = static_cast<platform::CUDADeviceContext *>(
-                        this->dev_ctxes_.at(place_))
-                        ->stream();
-      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
-                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
-      VLOG(10) << place_ << "RUN Scale loss grad op";
-    });
+  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
+                            this->dev_ctxes_.at(place_));
+  this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
+#else
+  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
+  framework::VisitDataType(out_dtype_, func);
 #endif
-  }
 }

 std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
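
The substance of the change: the loss-gradient value 1/num_dev used to be written as a hard-coded float, while the handle now carries an out_dtype_ and defers both allocation and the write to ScaleLossGradFunctor::apply<OutT>(), with framework::VisitDataType(out_dtype_, func) selecting OutT from the runtime dtype. Below is a minimal, self-contained sketch of that dispatch pattern; the VarType enum, the float16 struct, and CoeffFunctor are toy stand-ins assumed for illustration, not the actual Paddle types.

#include <cstdint>
#include <iostream>
#include <stdexcept>

// Hypothetical stand-ins for proto::VarType::Type and platform::float16,
// reduced to the two dtypes this diff cares about.
enum class VarType { FP32, FP16 };
struct float16 { uint16_t bits; };

// Minimal dispatch in the spirit of framework::VisitDataType: map the
// runtime dtype to a compile-time type and call visitor.apply<T>().
template <typename Visitor>
void VisitDataType(VarType dtype, const Visitor &visitor) {
  switch (dtype) {
    case VarType::FP32:
      visitor.template apply<float>();
      break;
    case VarType::FP16:
      visitor.template apply<float16>();
      break;
    default:
      throw std::runtime_error("unsupported dtype");
  }
}

// A visitor shaped like ScaleLossGradFunctor: a single templated apply()
// that materializes the loss-gradient coefficient in the requested type.
struct CoeffFunctor {
  float coeff_;
  template <typename OutT>
  void apply() const {
    std::cout << "writing 1/num_dev = " << coeff_ << " as a "
              << sizeof(OutT) << "-byte element\n";
  }
};

int main() {
  CoeffFunctor func{0.25f};            // e.g. num_dev == 4
  VisitDataType(VarType::FP32, func);  // dispatches to apply<float>()
  VisitDataType(VarType::FP16, func);  // dispatches to apply<float16>()
}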
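Two details follow from the dispatch. First, the output buffer is now allocated by mutable_data<OutT>(place_) inside apply(), so the host-to-device copy sizes itself with SizeOfType(out_dtype_) rather than the old sizeof(float); for fp16 the staged cast_coeff is only two bytes, and copying four would over-read it. Second, in a build without PADDLE_WITH_CUDA the non-CPU branch of apply() compiles to an empty block; that path should be unreachable, since such a build cannot create a CUDA place, but an explicit PADDLE_THROW there would make the assumption visible.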