Implement StaticModelRunner to support fine-tuning static-graph pre-trained models in dygraph mode (#23171)
* static model runner basic implement, test=develop
* add run program op to execute loaded program, test=develop
* refactor static model runner & run program op, test=develop
* reset engine.cc to resolve conflict
* adapt the change of dygraph double grad, test=develop
* refactor impl to solve control flow error, test=develop
* clear debug code, test=develop
* fix ci str compatible error & checkout dygraph grad maker & add example, test=develop
* hide api & add op test, test=develop
* fix run program op test places error, test=develop
* fix program by review comment, test=develop
* delete change var desc name, test=develop
* fix other program by review comment, test=develop
* remove _static_graph_guard, test=develop
* add selectedrows test, test=develop
* remove desc parser, test=develop
* fix detail program, test=develop
* change scope create & add test, test=develop
parent 9297f49e4b
commit 75bd350710
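For reference, the end-to-end workflow this PR enables, condensed from the new unit test below into a minimal sketch (the save directory `./infer_model`, the layer sizes, and the hyperparameters are illustrative only): a model is trained and saved with `fluid.io.save_inference_model` under static graph mode, then loaded with `fluid.dygraph.static_runner.StaticModelRunner` and fine-tuned under dygraph mode.

```python
import numpy as np
import paddle.fluid as fluid

# Phase 1 (static graph mode): build, train and save an inference model.
img = fluid.data(name='img', shape=[None, 784], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
pred = fluid.layers.fc(input=img, size=10, act='softmax')
avg_loss = fluid.layers.mean(
    fluid.layers.cross_entropy(input=pred, label=label))
fluid.optimizer.SGD(learning_rate=0.001).minimize(avg_loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
exe.run(feed={'img': np.random.random([32, 784]).astype('float32'),
              'label': np.random.randint(10, size=[32, 1]).astype('int64')},
        fetch_list=[avg_loss])
fluid.io.save_inference_model("./infer_model", ["img"], [pred], exe)

# Phase 2 (dygraph mode): load the saved model and fine-tune it.
with fluid.dygraph.guard():
    net = fluid.dygraph.static_runner.StaticModelRunner("./infer_model")
    sgd = fluid.optimizer.SGD(learning_rate=0.001,
                              parameter_list=net.parameters())
    net.train()
    x = fluid.dygraph.to_variable(
        np.random.random([32, 784]).astype('float32'))
    y = fluid.dygraph.to_variable(
        np.random.randint(10, size=[32, 1]).astype('int64'))
    cost = net(inputs=x)
    avg_loss = fluid.layers.mean(fluid.layers.cross_entropy(cost, y))
    avg_loss.backward()
    sgd.minimize(avg_loss)
    net.clear_gradients()
```

Under the hood, `StaticModelRunner` executes the loaded program through the `run_program` op added below, which is what lets the loaded parameters participate in dygraph autograd and be fine-tuned like those of an ordinary `Layer`.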
@@ -0,0 +1,185 @@
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/run_program_op.h"

#include <string>

namespace paddle {
namespace operators {

class RunProgramOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true,
                      platform::errors::NotFound(
                          "Input(X) of RunProgramOp should not be null."));
    PADDLE_ENFORCE_EQ(ctx->HasInputs("Params"), true,
                      platform::errors::NotFound(
                          "Input(Params) of RunProgramOp should not be null."));
    PADDLE_ENFORCE_EQ(ctx->HasOutputs("Out"), true,
                      platform::errors::NotFound(
                          "Output(Out) of RunProgramOp should not be null."));
  }

 protected:
  /* [Why use single type kernel]:
   *
   * This op is similar to a control flow op. It does not need
   * an op kernel, but in order to make it executable under dynamic
   * graph mode, it is implemented with an op kernel.
   *
   * Whether the kernel data type is int, float or any other type
   * has no effect on its execution logic, so a data type is
   * specified directly here.
   *
   * Of course, the data type here is also not important.
   */
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(framework::proto::VarType::FP32,
                                   ctx.GetPlace());
  }

  framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name, const framework::Tensor& tensor,
      const framework::OpKernelType& expected_kernel_type) const override {
    return expected_kernel_type;
  }
};

class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(vector<LoDTensor>)"
             "The input tensors of RunProgram operator, also the feed targets "
             "of loaded program.")
        .AsDuplicable();
AddInput("Params",
|
||||
"(vector<LoDTensor or SelecetedRows>)"
|
||||
"The input parameter of RunProgram operator, also the parameters "
|
||||
"of the loaded program.")
|
||||
.AsDuplicable();
|
||||
AddOutput("Out",
|
||||
"(vector<LoDTensor>)"
|
||||
"The output tensors of RunProgram operator, also the fetch "
|
||||
"targets of the loaded program.")
|
||||
.AsDuplicable();
|
||||
AddOutput("OutScope",
|
||||
"(StepScopeVar)"
|
||||
"A vector of execution scope in RunProgram operator, which "
|
||||
"contains at most one scope."
|
||||
"NOTE: Do not use Scope directly because Scope output is not "
|
||||
"currently supported.");
|
||||
AddAttr<BlockDesc*>("global_block",
|
||||
"(BlockDesc *)"
|
||||
"The global block of executed program desc.");
|
||||
AddAttr<int64_t>("start_op_index",
|
||||
"(int64_t)"
|
||||
"The index of the op to start execution");
|
||||
AddAttr<int64_t>("end_op_index",
|
||||
"(int64_t)"
|
||||
"The index of the op to stop execution");
|
||||
AddAttr<bool>("is_test",
|
||||
"(bool, default false) Set to true for inference only, false "
|
||||
"for training.")
|
||||
.SetDefault(false);
|
||||
AddComment(R"DOC(
|
||||
RunProgram operator.
|
||||
|
||||
The RunProgram operator receives a program's feed targets, fetch targets,
|
||||
and parameters, and receives the forward and backward program desc
|
||||
as attributes, and then executes the program by executor.
|
||||
|
||||
NOTE: This operator is added so that the inference model stored by
|
||||
`fluid.io.save_inference_model` under the static graph mode can be loaded
|
||||
under the dynamic graph mode for fine-tuning or inferencing.
|
||||
|
||||
)DOC");
|
||||
}
|
||||
};
|
||||
|
||||
class RunProgramGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true,
                      platform::errors::NotFound(
                          "Input(X) of RunProgramGradOp should not be null."));
    PADDLE_ENFORCE_EQ(
        ctx->HasInputs("Params"), true,
        platform::errors::NotFound(
            "Input(Params) of RunProgramGradOp should not be null."));
    PADDLE_ENFORCE_EQ(
        ctx->HasInputs(framework::GradVarName("Out")), true,
        platform::errors::NotFound(
            "Input(Out@GRAD) of RunProgramGradOp should not be null."));
    // NOTE: X@GRAD and Params@GRAD may not exist, because their
    // stop_gradient attribute may be set to True.
  }

 protected:
  /* see [Why use single type kernel] */
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(framework::proto::VarType::FP32,
                                   ctx.GetPlace());
  }

  framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name, const framework::Tensor& tensor,
      const framework::OpKernelType& expected_kernel_type) const override {
    return expected_kernel_type;
  }
};

template <typename T>
class RunProgramGradOpMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
  void Apply(GradOpPtr<T> grad_op) const override {
    grad_op->SetType("run_program_grad");
    grad_op->SetInput("X", this->Input("X"));
    grad_op->SetInput("Params", this->Input("Params"));
    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
    grad_op->SetInput("OutScope", this->Output("OutScope"));
    grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
    grad_op->SetOutput(framework::GradVarName("Params"),
                       this->InputGrad("Params"));
    grad_op->SetAttrMap(this->Attrs());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(run_program, ops::RunProgramOp, ops::RunProgramOpMaker,
                  ops::RunProgramGradOpMaker<paddle::framework::OpDesc>,
                  ops::RunProgramGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(run_program_grad, ops::RunProgramGradOp);

/* see [Why use single type kernel] */
REGISTER_OP_CPU_KERNEL(
    run_program,
    ops::RunProgramOpKernel<paddle::platform::CPUDeviceContext, float>)
REGISTER_OP_CPU_KERNEL(
    run_program_grad,
    ops::RunProgramGradOpKernel<paddle::platform::CPUDeviceContext, float>)
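To make the attribute semantics above concrete, here is a hedged, purely illustrative Python sketch of how the pieces of a model saved by `fluid.io.save_inference_model` map onto the op's inputs and attributes. It is not the `StaticModelRunner` implementation (that lives in one of the suppressed diffs), and the `./infer_model` directory name is the illustrative one from the example above.

```python
import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
# Load a model previously saved by fluid.io.save_inference_model.
program, feed_names, fetch_targets = fluid.io.load_inference_model(
    "./infer_model", exe)

# Attr(global_block): the global (0-th) block of the loaded program desc.
global_block = program.desc.block(0)

# Attr(start_op_index) / Attr(end_op_index): the forward pass executes the
# op range [start_op_index, end_op_index) of that block; for a forward-only
# inference program this is simply all of its ops.
start_op_index = 0
end_op_index = global_block.op_size()

# Input(X): the feed targets; Output(Out): the fetch targets;
# Input(Params): roughly the persistable variables of the loaded program
# (ignoring the special 'feed'/'fetch' variables).
params = [v.name for v in program.list_vars()
          if v.persistable and v.name not in ('feed', 'fetch')]
print(feed_names, [v.name for v in fetch_targets], params,
      start_op_index, end_op_index)
```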
@@ -0,0 +1,28 @@
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/run_program_op.h"

#include "paddle/fluid/platform/float16.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

/* see [Why use single type kernel] */
REGISTER_OP_CUDA_KERNEL(
    run_program,
    ops::RunProgramOpKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
    run_program_grad,
    ops::RunProgramGradOpKernel<paddle::platform::CUDADeviceContext, float>);
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,235 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest

import contextlib
import numpy as np
import six

import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from test_imperative_base import new_program_scope

import paddle.fluid.transpiler.details.program_utils as pu

def while_softmax_regression(img):
    def cond(i, times, pred):
        return i < times

    def body(i, times, pred):
        pred = fluid.layers.fc(input=pred, size=10, act='softmax')
        i = i + 1
        return [i, times, pred]

    i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
    times = fluid.layers.fill_constant(shape=[1], dtype='int64', value=5)
    pred = fluid.layers.fc(input=img, size=10, act='softmax')
    i, times, pred = fluid.layers.while_loop(
        cond=cond, body=body, loop_vars=[i, times, pred])
    return pred

class TestImperativeStaticModelRunnerWhile(unittest.TestCase):
    def setUp(self):
        self.seed = 90
        self.batch_size = 32
        self.batch_num = 50
        self.save_dirname = "while.inference.model"
        self.model_filename = None
        self.params_filename = None

    def _random_batch_reader(self):
        def _get_random_images_and_labels(image_shape, label_shape):
            image = np.random.random(size=image_shape).astype('float32')
            label = np.random.random(size=label_shape).astype('int64')
            return image, label

        def __reader__():
            for _ in range(self.batch_num):
                batch_image, batch_label = _get_random_images_and_labels(
                    [self.batch_size, 784], [self.batch_size, 1])
                yield batch_image, batch_label

        return __reader__
    def train_and_save_model(self):
        startup_program = fluid.default_startup_program()
        main_program = fluid.default_main_program()

        img = fluid.data(name='img', shape=[None, 784], dtype='float32')
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')

        pred = while_softmax_regression(img)

        loss = fluid.layers.cross_entropy(input=pred, label=label)
        avg_loss = fluid.layers.mean(loss)

        optimizer = fluid.optimizer.SGD(learning_rate=0.001)
        optimizer.minimize(avg_loss)

        # pu.program_to_code(main_program, skip_op_callstack=True)

        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
        ) else fluid.CPUPlace()

        exe = fluid.Executor(place)
        exe.run(startup_program)

        loader = fluid.io.DataLoader.from_generator(
            feed_list=[img, label], capacity=5, iterable=True)
        loader.set_batch_generator(self._random_batch_reader(), places=place)

        for data in loader():
            exe.run(main_program, feed=data, fetch_list=[avg_loss])

        fluid.io.save_inference_model(
            self.save_dirname, ["img"], [pred],
            exe,
            model_filename=self.model_filename,
            params_filename=self.params_filename)
    def load_and_train_dygraph(self):
        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
        ) else fluid.CPUPlace()
        with fluid.dygraph.guard(place):
            fluid.default_startup_program().random_seed = self.seed
            fluid.default_main_program().random_seed = self.seed
            np.random.seed(self.seed)

            backward_strategy = fluid.dygraph.BackwardStrategy()
            backward_strategy.sort_sum_gradient = True

            while_net = fluid.dygraph.static_runner.StaticModelRunner(
                self.save_dirname)

            dy_param_init_value = {}
            for param in while_net.parameters():
                dy_param_init_value[param.name] = param.numpy()

            sgd = fluid.optimizer.SGD(learning_rate=0.001,
                                      parameter_list=while_net.parameters())

            train_loader = fluid.io.DataLoader.from_generator(capacity=10)
            train_loader.set_batch_generator(
                self._random_batch_reader(), places=place)

            while_net.train()

            for data in train_loader():
                img = data[0]
                label = data[1]
                label.stop_gradient = True

                cost = while_net(inputs=img)

                loss = fluid.layers.cross_entropy(cost, label)
                avg_loss = fluid.layers.mean(loss)

                avg_loss.backward(backward_strategy)
                sgd.minimize(avg_loss)
                while_net.clear_gradients()

            dy_out = avg_loss.numpy()
            dy_param_value = {}
            for param in while_net.parameters():
                dy_param_value[param.name] = param.numpy()

        return dy_out, dy_param_init_value, dy_param_value
    def load_and_train_static(self):
        with new_program_scope():
            fluid.default_startup_program().random_seed = self.seed
            fluid.default_main_program().random_seed = self.seed
            np.random.seed(self.seed)

            img = fluid.data(name='img', shape=[None, 784], dtype='float32')
            label = fluid.data(name='label', shape=[None, 1], dtype='int64')

            pred = while_softmax_regression(img)

            loss = fluid.layers.cross_entropy(input=pred, label=label)
            avg_loss = fluid.layers.mean(loss)

            optimizer = fluid.optimizer.SGD(learning_rate=0.001)
            optimizer.minimize(avg_loss)

            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
            ) else fluid.CPUPlace()

            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

            fluid.io.load_params(
                exe,
                self.save_dirname,
                main_program=fluid.default_main_program(),
                filename=self.params_filename)

            static_param_init_value = {}
            static_param_name_list = []
            for param in fluid.default_main_program().all_parameters():
                static_param_name_list.append(param.name)
                static_param_init_value[param.name] = fluid.executor._fetch_var(
                    param.name)

            loader = fluid.io.DataLoader.from_generator(
                feed_list=[img, label], capacity=5, iterable=True)
            loader.set_batch_generator(
                self._random_batch_reader(), places=place)

            for data in loader():
                # fetch the loss and all parameter values so that the final
                # parameters can be compared with the dygraph run
                fetch_list = [avg_loss.name]
                fetch_list.extend(static_param_name_list)

                out = exe.run(fluid.default_main_program(),
                              feed=data,
                              fetch_list=fetch_list)

            static_param_value = {}
            static_out = out[0]
            for i in range(1, len(out)):
                static_param_value[static_param_name_list[i - 1]] = out[i]

        return static_out, static_param_init_value, static_param_value
    def test_while_no_params_filename(self):
        # Phase 1. run and save static model
        self.train_and_save_model()

        # Phase 2. load model & train dygraph
        dy_out, dy_param_init_value, dy_param_value = \
            self.load_and_train_dygraph()

        static_out, static_param_init_value, static_param_value = \
            self.load_and_train_static()

        # Phase 3. compare
        for key, value in six.iteritems(static_param_init_value):
            key += core.loaded_var_suffix()
            self.assertTrue(np.array_equal(value, dy_param_init_value[key]))

        self.assertTrue(np.allclose(static_out, dy_out))

        for key, value in six.iteritems(static_param_value):
            key += core.loaded_var_suffix()
            self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))


if __name__ == '__main__':
    unittest.main()
File diff suppressed because it is too large