Implement StaticModelRunner to support fine-tuning static graph pre-trained models in dygraph mode (#23171)
* static model runner basic implement, test=develop
* add run program op to execute loaded program, test=develop
* refactor static model runner & run program op, test=develop
* reset engine.cc to resolve conflict
* adapt the change of dygraph double grad, test=develop
* refactor impl to solve control flow error, test=develop
* clear debug code, test=develop
* fix ci str compatible error & checkout dygraph grad maker & add example, test=develop
* hide api & add op test, test=develop
* fix run program op test places error, test=develop
* fix program by review comment, test=develop
* delete change var desc name, test=develop
* fix other program by review comment, test=develop
* remove _static_graph_guard, test=develop
* add selectedrows test, test=develop
* remove desc parser, test=develop
* fix detail program, test=develop
* change scope create & add test, test=develop
parent 9297f49e4b
commit 75bd350710
@@ -0,0 +1,185 @@
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/run_program_op.h"

#include <string>

namespace paddle {
namespace operators {

class RunProgramOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true,
                      platform::errors::NotFound(
                          "Input(X) of RunProgramOp should not be null."));
    PADDLE_ENFORCE_EQ(ctx->HasInputs("Params"), true,
                      platform::errors::NotFound(
                          "Input(Params) of RunProgramOp should not be null."));
    PADDLE_ENFORCE_EQ(ctx->HasOutputs("Out"), true,
                      platform::errors::NotFound(
                          "Output(Out) of RunProgramOp should not be null."));
  }

 protected:
  /* [Why use single type kernel]:
   *
   * This op is similar to a control flow op: it does not need an op
   * kernel, but in order to make it executable under dynamic graph
   * mode, it is implemented with an op kernel.
   *
   * The kernel data type, whether int, float or any other type, has
   * no effect on the execution logic, so a data type is specified
   * directly here.
   *
   * Of course, the data type here is also not important.
   */
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(framework::proto::VarType::FP32,
                                   ctx.GetPlace());
  }

  framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name, const framework::Tensor& tensor,
      const framework::OpKernelType& expected_kernel_type) const override {
    return expected_kernel_type;
  }
};

class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(vector<LoDTensor>) "
             "The input tensors of RunProgram operator, also the feed targets "
             "of the loaded program.")
        .AsDuplicable();
    AddInput("Params",
             "(vector<LoDTensor or SelectedRows>) "
             "The input parameters of RunProgram operator, also the parameters "
             "of the loaded program.")
        .AsDuplicable();
    AddOutput("Out",
              "(vector<LoDTensor>) "
              "The output tensors of RunProgram operator, also the fetch "
              "targets of the loaded program.")
        .AsDuplicable();
    AddOutput("OutScope",
              "(StepScopeVar) "
              "A vector of execution scopes in RunProgram operator, which "
              "contains at most one scope. "
              "NOTE: Do not use Scope directly because Scope output is not "
              "currently supported.");
    AddAttr<BlockDesc*>("global_block",
                        "(BlockDesc*) "
                        "The global block of the executed program desc.");
    AddAttr<int64_t>("start_op_index",
                     "(int64_t) "
                     "The index of the op at which execution starts.");
    AddAttr<int64_t>("end_op_index",
                     "(int64_t) "
                     "The index of the op at which execution stops.");
    AddAttr<bool>("is_test",
                  "(bool, default false) Set to true for inference only, false "
                  "for training.")
        .SetDefault(false);
    AddComment(R"DOC(
RunProgram operator.

The RunProgram operator receives a program's feed targets, fetch targets,
and parameters, receives the forward and backward program descs as
attributes, and then executes the program with an executor.

NOTE: This operator is added so that an inference model saved by
`fluid.io.save_inference_model` under the static graph mode can be loaded
under the dynamic graph mode for fine-tuning or inference.

)DOC");
  }
};

class RunProgramGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true,
                      platform::errors::NotFound(
                          "Input(X) of RunProgramGradOp should not be null."));
    PADDLE_ENFORCE_EQ(
        ctx->HasInputs("Params"), true,
        platform::errors::NotFound(
            "Input(Params) of RunProgramGradOp should not be null."));
    PADDLE_ENFORCE_EQ(
        ctx->HasInputs(framework::GradVarName("Out")), true,
        platform::errors::NotFound(
            "Input(Out@GRAD) of RunProgramGradOp should not be null."));
    // NOTE: X@GRAD and Params@GRAD may not exist, because the
    // corresponding variables can be set stop_gradient = True
  }

 protected:
  /* see [Why use single type kernel] */
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(framework::proto::VarType::FP32,
                                   ctx.GetPlace());
  }

  framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name, const framework::Tensor& tensor,
      const framework::OpKernelType& expected_kernel_type) const override {
    return expected_kernel_type;
  }
};

template <typename T>
class RunProgramGradOpMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
  void Apply(GradOpPtr<T> grad_op) const override {
    grad_op->SetType("run_program_grad");
    grad_op->SetInput("X", this->Input("X"));
    grad_op->SetInput("Params", this->Input("Params"));
    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
    grad_op->SetInput("OutScope", this->Output("OutScope"));
    grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
    grad_op->SetOutput(framework::GradVarName("Params"),
                       this->InputGrad("Params"));
    grad_op->SetAttrMap(this->Attrs());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(run_program, ops::RunProgramOp, ops::RunProgramOpMaker,
                  ops::RunProgramGradOpMaker<paddle::framework::OpDesc>,
                  ops::RunProgramGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(run_program_grad, ops::RunProgramGradOp);

/* see [Why use single type kernel] */
REGISTER_OP_CPU_KERNEL(
    run_program,
    ops::RunProgramOpKernel<paddle::platform::CPUDeviceContext, float>)
REGISTER_OP_CPU_KERNEL(
    run_program_grad,
    ops::RunProgramGradOpKernel<paddle::platform::CPUDeviceContext, float>)
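For orientation, here is a minimal, hedged sketch of how this op is reached from Python, mirroring the test file later in this diff: a model saved with `fluid.io.save_inference_model` under static graph mode is wrapped by the `StaticModelRunner` layer added in this PR, whose forward pass runs `run_program` and whose backward pass runs `run_program_grad`. The directory name and input shape below are illustrative, not part of the diff.

# Usage sketch (assumptions: a model has already been saved to
# "while.inference.model", as in the test below; input shape is illustrative).
import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    # Wrap the program and parameters saved by fluid.io.save_inference_model.
    runner = fluid.dygraph.static_runner.StaticModelRunner(
        "while.inference.model")
    sgd = fluid.optimizer.SGD(learning_rate=0.001,
                              parameter_list=runner.parameters())

    img = fluid.dygraph.to_variable(
        np.random.random([4, 784]).astype('float32'))
    pred = runner(inputs=img)   # forward: executes the loaded program (run_program)
    loss = fluid.layers.mean(pred)
    loss.backward()             # backward: executes run_program_grad
    sgd.minimize(loss)
    runner.clear_gradients()

The sketch only exercises the public call path shown in the test; the op itself is hidden behind the `StaticModelRunner` API.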
@@ -0,0 +1,28 @@
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/run_program_op.h"

#include "paddle/fluid/platform/float16.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

/* see [Why use single type kernel] */
REGISTER_OP_CUDA_KERNEL(
    run_program,
    ops::RunProgramOpKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
    run_program_grad,
    ops::RunProgramGradOpKernel<paddle::platform::CUDADeviceContext, float>);
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,235 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest

import contextlib
import numpy as np
import six

import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from test_imperative_base import new_program_scope

import paddle.fluid.transpiler.details.program_utils as pu


def while_softmax_regression(img):
    def cond(i, times, pred):
        return i < times

    def body(i, times, pred):
        pred = fluid.layers.fc(input=pred, size=10, act='softmax')
        i = i + 1
        return [i, times, pred]

    i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
    times = fluid.layers.fill_constant(shape=[1], dtype='int64', value=5)
    pred = fluid.layers.fc(input=img, size=10, act='softmax')
    i, times, pred = fluid.layers.while_loop(
        cond=cond, body=body, loop_vars=[i, times, pred])
    return pred


class TestImperativeStaticModelRunnerWhile(unittest.TestCase):
    def setUp(self):
        self.seed = 90
        self.batch_size = 32
        self.batch_num = 50
        self.save_dirname = "while.inference.model"
        self.model_filename = None
        self.params_filename = None

    def _random_batch_reader(self):
        def _get_random_images_and_labels(image_shape, label_shape):
            image = np.random.random(size=image_shape).astype('float32')
            label = np.random.random(size=label_shape).astype('int64')
            return image, label

        def __reader__():
            for _ in range(self.batch_num):
                batch_image, batch_label = _get_random_images_and_labels(
                    [self.batch_size, 784], [self.batch_size, 1])
                yield batch_image, batch_label

        return __reader__

    def train_and_save_model(self):
        startup_program = fluid.default_startup_program()
        main_program = fluid.default_main_program()

        img = fluid.data(name='img', shape=[None, 784], dtype='float32')
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')

        pred = while_softmax_regression(img)

        loss = fluid.layers.cross_entropy(input=pred, label=label)
        avg_loss = fluid.layers.mean(loss)

        optimizer = fluid.optimizer.SGD(learning_rate=0.001)
        optimizer.minimize(avg_loss)

        # pu.program_to_code(main_program, skip_op_callstack=True)

        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
        ) else fluid.CPUPlace()

        exe = fluid.Executor(place)
        exe.run(startup_program)

        loader = fluid.io.DataLoader.from_generator(
            feed_list=[img, label], capacity=5, iterable=True)
        loader.set_batch_generator(self._random_batch_reader(), places=place)

        for data in loader():
            exe.run(main_program, feed=data, fetch_list=[avg_loss])

        fluid.io.save_inference_model(
            self.save_dirname, ["img"], [pred],
            exe,
            model_filename=self.model_filename,
            params_filename=self.params_filename)

    def load_and_train_dygraph(self):
        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
        ) else fluid.CPUPlace()
        with fluid.dygraph.guard(place):
            fluid.default_startup_program().random_seed = self.seed
            fluid.default_main_program().random_seed = self.seed
            np.random.seed(self.seed)

            backward_strategy = fluid.dygraph.BackwardStrategy()
            backward_strategy.sort_sum_gradient = True

            while_net = fluid.dygraph.static_runner.StaticModelRunner(
                self.save_dirname)

            dy_param_init_value = {}
            for param in while_net.parameters():
                dy_param_init_value[param.name] = param.numpy()

            sgd = fluid.optimizer.SGD(learning_rate=0.001,
                                      parameter_list=while_net.parameters())

            train_loader = fluid.io.DataLoader.from_generator(capacity=10)
            train_loader.set_batch_generator(
                self._random_batch_reader(), places=place)

            while_net.train()

            for data in train_loader():
                img = data[0]
                label = data[1]
                label.stop_gradient = True

                cost = while_net(inputs=img)

                loss = fluid.layers.cross_entropy(cost, label)
                avg_loss = fluid.layers.mean(loss)

                avg_loss.backward(backward_strategy)
                sgd.minimize(avg_loss)
                while_net.clear_gradients()

            dy_out = avg_loss.numpy()
            dy_param_value = {}
            for param in while_net.parameters():
                dy_param_value[param.name] = param.numpy()

        return dy_out, dy_param_init_value, dy_param_value

    def load_and_train_static(self):
        with new_program_scope():
            fluid.default_startup_program().random_seed = self.seed
            fluid.default_main_program().random_seed = self.seed
            np.random.seed(self.seed)

            img = fluid.data(name='img', shape=[None, 784], dtype='float32')
            label = fluid.data(name='label', shape=[None, 1], dtype='int64')

            pred = while_softmax_regression(img)

            loss = fluid.layers.cross_entropy(input=pred, label=label)
            avg_loss = fluid.layers.mean(loss)

            optimizer = fluid.optimizer.SGD(learning_rate=0.001)
            optimizer.minimize(avg_loss)

            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
            ) else fluid.CPUPlace()

            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

            fluid.io.load_params(
                exe,
                self.save_dirname,
                main_program=fluid.default_main_program(),
                filename=self.params_filename)

            static_param_init_value = {}
            static_param_name_list = []
            for param in fluid.default_main_program().all_parameters():
                static_param_name_list.append(param.name)
                static_param_init_value[param.name] = fluid.executor._fetch_var(
                    param.name)

            loader = fluid.io.DataLoader.from_generator(
                feed_list=[img, label], capacity=5, iterable=True)
            loader.set_batch_generator(
                self._random_batch_reader(), places=place)

            for data in loader():
                fetch_list = [avg_loss.name]
                fetch_list.extend(static_param_name_list)

                out = exe.run(fluid.default_main_program(),
                              feed=data,
                              fetch_list=fetch_list)

            static_param_value = {}
            static_out = out[0]
            for i in range(1, len(out)):
                static_param_value[static_param_name_list[i - 1]] = out[i]

        return static_out, static_param_init_value, static_param_value

    def test_while_no_params_filename(self):
        # Phase 1. run and save static model
        self.train_and_save_model()

        # Phase 2. load model & train dygraph
        dy_out, dy_param_init_value, dy_param_value = \
            self.load_and_train_dygraph()

        static_out, static_param_init_value, static_param_value = \
            self.load_and_train_static()

        # Phase 3. compare
        for key, value in six.iteritems(static_param_init_value):
            key += core.loaded_var_suffix()
            self.assertTrue(np.array_equal(value, dy_param_init_value[key]))

        self.assertTrue(np.allclose(static_out, dy_out))

        for key, value in six.iteritems(static_param_value):
            key += core.loaded_var_suffix()
            self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))


if __name__ == '__main__':
    unittest.main()
File diff suppressed because it is too large