Refine adam op to improve performance, test=develop (#22346)

* Refine adam op, test=develop

* Fuse kernels together to reduce cpu time.

* Refine paddle enforce, test=develop

* Remove some comments, test=develop

* Refine code, test=develop

* Refine cuda kernel, test=develop

* Refine code according to comments, test=develop
zhaoyuchen2018 committed (commit 72dde4abde, parent 8c381cd957)

@@ -145,7 +145,8 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       auto size = lod_tensors[i]->numel();
       PADDLE_ENFORCE_GT(size, 0);
       ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
-         << "), ";
+         << ") "
+         << " address:" << lod_tensors[i]->data<void>() << ", ";
       *numel += platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
                                     place) /
                 size_of_dtype;
@@ -160,6 +161,15 @@ class CoalesceTensorOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const framework::Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const override {
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   expected_kernel_type.place_,
+                                   tensor.layout());
+  }
 };
 
 class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
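The new override changes how coalesce_tensor's inputs are dispatched: by reporting the kernel's expected place for every input, the framework never sees a place mismatch and never schedules a device copy for the (potentially many) coalesced inputs. A self-contained model of that decision rule (plain Python with hypothetical names; a sketch of the dispatch idea, not Paddle's actual runtime code):

from collections import namedtuple

# Hypothetical stand-in for framework::OpKernelType.
KernelType = namedtuple("KernelType", ["dtype", "place", "layout"])

def needs_transform(type_for_var, expected):
    # A cast/copy pass is inserted only when the type reported for a
    # variable disagrees with the kernel's expected type.
    return type_for_var != expected

def default_type_for_var(tensor_place, tensor_layout, expected):
    # Default behaviour: report where the tensor actually lives, so an
    # input sitting on another device gets copied before the kernel runs.
    return KernelType(expected.dtype, tensor_place, tensor_layout)

def coalesce_type_for_var(tensor_place, tensor_layout, expected):
    # Mirrors the override above: echo back the expected place, so no
    # device copy is ever scheduled for coalesce_tensor's inputs.
    return KernelType(expected.dtype, expected.place, tensor_layout)

expected = KernelType("float32", "GPU", "NCHW")
print(needs_transform(default_type_for_var("CPU", "NCHW", expected), expected))   # True
print(needs_transform(coalesce_type_for_var("CPU", "NCHW", expected), expected))  # False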

@@ -131,6 +131,17 @@ framework::OpKernelType AdamOp::GetExpectedKernelType(
   return framework::OpKernelType(input_data_type, ctx.GetPlace());
 }
 
+framework::OpKernelType AdamOp::GetKernelTypeForVar(
+    const std::string &var_name, const framework::Tensor &tensor,
+    const framework::OpKernelType &expected_kernel_type) const {
+  if (var_name == "Beta1Pow" || var_name == "Beta2Pow") {
+    return expected_kernel_type;
+  } else {
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+}
+
 class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
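AdamOp gets the same treatment, but keyed on variable names: Beta1Pow and Beta2Pow report the expected kernel type verbatim, so the two one-element accumulators are never copied to the GPU before each step, while all other inputs keep the default rule. A minimal model of the branch (plain Python, hypothetical names; not Paddle code):

from collections import namedtuple

KernelType = namedtuple("KernelType", ["dtype", "place", "layout"])

def adam_type_for_var(var_name, tensor_place, tensor_layout, expected):
    # Mirrors AdamOp::GetKernelTypeForVar above: the beta-pow scalars are
    # reported as already matching, so the CPU-resident accumulators are
    # read in place instead of being copied host-to-device every step.
    if var_name in ("Beta1Pow", "Beta2Pow"):
        return expected
    return KernelType(expected.dtype, tensor_place, tensor_layout)

expected = KernelType("float32", "GPU", "NCHW")
# Beta1Pow lives on the CPU, yet no transform is scheduled:
print(adam_type_for_var("Beta1Pow", "CPU", "NCHW", expected) == expected)  # True
# A regular input on the CPU would still be copied to the GPU:
print(adam_type_for_var("Moment1", "CPU", "NCHW", expected) == expected)   # False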

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -187,12 +187,21 @@ void SetTensorFromPyArrayT(
     }
   } else {
 #ifdef PADDLE_WITH_CUDA
-    auto dst = self->mutable_data<T>(place);
+    T *dst;
+    if (array.nbytes() <= 4 && !paddle::platform::is_cuda_pinned_place(place)) {
+      dst = self->mutable_data<T>(platform::CPUPlace());
+    } else {
+      dst = self->mutable_data<T>(place);
+    }
     if (paddle::platform::is_cuda_pinned_place(place)) {
       std::memcpy(dst, array.data(), array.nbytes());
     } else if (paddle::platform::is_gpu_place(place)) {
-      paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
-                                      cudaMemcpyHostToDevice);
+      if (array.nbytes() <= 4) {
+        std::memcpy(dst, array.data(), array.nbytes());
+      } else {
+        paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
+                                        cudaMemcpyHostToDevice);
+      }
     } else {
       PADDLE_THROW(
           "Incompatible place type: Tensor.set() supports CPUPlace, CUDAPlace "

@@ -404,7 +404,8 @@ class Optimizer(object):
                          dtype=None,
                          fill_value=0.0,
                          shape=None,
-                         type=None):
+                         type=None,
+                         force_cpu=False):
         """Utility function to add an accumulator for a parameter
 
         Args:
@@ -438,7 +439,9 @@ class Optimizer(object):
             shape=shape,
             belong_to_optimizer=True)
         self.helper.set_variable_initializer(
-            var, initializer=Constant(value=float(fill_value)))
+            var,
+            initializer=Constant(
+                value=float(fill_value), force_cpu=force_cpu))
 
         if framework.in_dygraph_mode():
             if len(self._accumulators_holder) > 0:
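The new force_cpu argument is threaded through to the Constant initializer, which already exposes this flag; the same mechanism is visible at the user level in the fluid 1.x API of this era, e.g. through fill_constant (a sketch, assuming a GPU build of that release):

import paddle.fluid as fluid

# Sketch: force_cpu=True keeps the one-element tensor in host memory even
# when the program is executed on a CUDAPlace, which is how the beta-pow
# accumulators below avoid living on the GPU.
beta1_pow = fluid.layers.fill_constant(
    shape=[1], dtype='float32', value=0.9, force_cpu=True)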
@@ -1790,14 +1793,14 @@ class AdamOptimizer(Optimizer):
                 fill_value=0.9 if isinstance(self._beta1, Variable) \
                     else self._beta1,
                 shape=[1],
-                type=core.VarDesc.VarType.LOD_TENSOR)
+                type=core.VarDesc.VarType.LOD_TENSOR, force_cpu=True)
             self._add_accumulator(
                 name=self._beta2_pow_acc_str,
                 param=p,
                 fill_value=0.999 if isinstance(self._beta2, Variable) \
                     else self._beta2,
                 shape=[1],
-                type=core.VarDesc.VarType.LOD_TENSOR)
+                type=core.VarDesc.VarType.LOD_TENSOR, force_cpu=True)
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
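From the user's side nothing changes: constructing the optimizer as before now silently keeps the beta1_pow_acc and beta2_pow_acc accumulators on the CPU. A usage sketch against the fluid 1.x API (the network and loss variable are elided and assumed to exist):

import paddle.fluid as fluid

# loss = ...  (a scalar loss variable built with the fluid 1.x graph API)
adam = fluid.optimizer.AdamOptimizer(
    learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8)
# minimize() triggers _create_accumulators(), which now passes
# force_cpu=True for the two beta-pow accumulators shown above:
# adam.minimize(loss)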
