"Serialize LoDTensor, Save/Restore model" (#4602)
* "add model format design doc" * "add restore function" * "add parse protobuf" * "move necessary information to saver.proto" * "format code" * "add gpu option" * "add lod info" * "add saveop python test wrapper" * "checkpoint reuse save operator" * "rewrite model format design doc" * "async support needed" * "fix run once" * "fix doc based on comments" * "refine based on comments" * "fix based comments" * "remove persistable flag from framework.proto" * "add IndicateDataType to restore op" * "add save test" * "modify save restore code" * "modified the restore logic" * rm checkpoint_op.cc * rm test_checkpoint_op.py * "get inputs outputs name from execution context" * Saving each variable to a independent file * Fix bugs * Rewrite save_restore_op_test with new Python framework * Move `SaveOp` and `RestoreOp` from OpWithKernel to OpBase * Refine unit test of SaveOp and RestoreOp * fix compile errorwqrevert-4814-Add_sequence_project_op
parent
d78d119346
commit
fd2eb55071
@ -0,0 +1,36 @@
|
||||
# Design Doc: Model Format
|
||||
|
||||
## Motivation
|
||||
|
||||
The model is the output of the training process. One complete model consists of two parts, namely, the **topology** and the **parameters**. To support industrial deployment, the model format must be self-contained and must not expose any training source code.
|
||||
|
||||
In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model; we must support large parameters and efficient serialization/deserialization.
|
||||
|
||||
## Implementation
|
||||
|
||||
The topology is saved as plain text — in detail, a self-contained protobuf file.
|
||||
|
||||
The parameters are saved as a binary file. As we all know, a protobuf message has a size limit of [64M](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We did a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), and its result shows that protobuf is not suitable for this scenario.
|
||||
|
||||
As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and it has a description proto, [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as a byte-string header; it contains the necessary information, such as the `dims` and the `name` of the tensor, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores its values in a contiguous memory buffer; for speed, we dump the raw memory to disk and save it as the byte-string content. So the binary format of one tensor is:
|
||||
|
||||
|HeaderLength|ContentLength|**LoDTensorDesc**|**TensorValue**|
|
||||
|
||||
In detail, the tensor's byte layout is shown in the table below. Note that all signed values are written in little-endian order.
|
||||
|
||||
```text
|
||||
[offset]  [type]            [description]
0000      4 bytes integer   HeaderLength, the length of LoDTensorDesc
0004      4 bytes integer   ContentLength, the length of the LoDTensor buffer
0008      1 byte   char     TensorDesc
0009      1 byte   char     TensorDesc
...
0100      1 byte   char     TensorValue
0101      1 byte   char     TensorValue
0102      1 byte   char     TensorValue ...
|
||||
...
|
||||
```
|
||||
|
||||
## Summary
|
||||
|
||||
We introduce the model format, the `ProgramDesc` describe the **topology**, and a bunch of particular format binary tensors describes the **parameters**.
|
@ -0,0 +1,39 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
syntax = "proto2";
|
||||
option optimize_for = LITE_RUNTIME;
|
||||
package paddle.framework;
|
||||
|
||||
import "framework.proto";
|
||||
|
||||
/**
|
||||
 * This file contains the information necessary for serializing models,
 * checkpoints, etc.
|
||||
*/
|
||||
|
||||
// One LoD (Level of Details) level: a list of sequence offsets into the
// next level (or into the tensor rows for the last level).
message LoDInfo { repeated int64 level = 1; }
|
||||
|
||||
/**
|
||||
 * Saves the LoDTensorDesc information through LoDTensorProto; its data memory
 * is copied to a C buffer immediately. See model_format.md for details.
|
||||
*/
|
||||
|
||||
message LoDTensorProto {
  // Element type of the serialized tensor buffer (from framework.proto).
  optional DataType data_type = 1;
  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
  // The LoD offsets, one LoDInfo per level; count should match lod_level.
  repeated LoDInfo levels = 3;
  // Number of LoD levels; 0 means a plain (non-sequence) tensor.
  optional int32 lod_level = 4 [ default = 0 ];
  // Format version of this serialized header, for forward compatibility.
  optional int32 version = 5;
}
|
@ -0,0 +1,147 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/framework/eigen.h"
|
||||
#include "paddle/framework/op_registry.h"
|
||||
|
||||
#include <fstream>
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
|
||||
using framework::Tensor;
|
||||
using framework::LoDTensor;
|
||||
|
||||
// Maps a variable name to its file path inside `folder_path`, e.g.
// variable "w" in folder "/tmp" becomes "/tmp/__w__".
inline static std::string VarToFileName(const std::string& folder_path,
                                        const std::string& var_name) {
  std::string file_name;
  file_name.reserve(folder_path.size() + var_name.size() + 5);
  file_name.append(folder_path).append("/__").append(var_name).append("__");
  return file_name;
}
|
||||
|
||||
class SaveOp : public framework::OperatorBase {
|
||||
public:
|
||||
SaveOp(const std::string& type, const framework::VariableNameMap& inputs,
|
||||
const framework::VariableNameMap& outputs,
|
||||
const framework::AttributeMap& attrs)
|
||||
: OperatorBase(type, inputs, outputs, attrs) {}
|
||||
|
||||
void Run(const framework::Scope& scope,
|
||||
const platform::DeviceContext& dev_ctx) const override {
|
||||
const auto& var_names = this->Inputs("X");
|
||||
for (const auto& name : var_names) {
|
||||
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
|
||||
"Can not find variable '%s' in the scope.", name);
|
||||
}
|
||||
std::string folder_path = this->Attr<std::string>("folderPath");
|
||||
PADDLE_ENFORCE(!folder_path.empty(),
|
||||
"'folderPath' of SaveOp shouldn't be empty.");
|
||||
|
||||
VLOG(1) << "Save variables to folder: " << folder_path;
|
||||
for (const auto& name : var_names) {
|
||||
std::string file_name = VarToFileName(folder_path, name);
|
||||
std::ofstream fout(file_name, std::ofstream::out);
|
||||
PADDLE_ENFORCE(fout.is_open(), "Fail to create file %s.", file_name);
|
||||
const LoDTensor& tensor = scope.FindVar(name)->Get<LoDTensor>();
|
||||
std::string bytes = tensor.SerializeToString();
|
||||
fout << bytes;
|
||||
fout.close();
|
||||
}
|
||||
VLOG(1) << "Compelete saving variables. Items count: " << var_names.size();
|
||||
}
|
||||
};
|
||||
|
||||
// Declares the proto interface of the "save" operator: a duplicable input
// list "X" and a required string attribute "folderPath".
class SaveOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  SaveOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // "X" is duplicable: the op accepts any number of input tensors.
    AddInput("X",
             "(tensor), the tensor count can be 1~INT_MAX, tensors names which "
             "values will be saved.")
        .AsDuplicable();
    AddAttr<std::string>("folderPath", "the folderPath for save model.");
    AddComment(R"DOC(
Save the input tensors to a binary file based on input tensor names and absolute path.

All the inputs can carry the LoD (Level of Details) information,
or not.
)DOC");
  }
};
|
||||
|
||||
class RestoreOp : public framework::OperatorBase {
|
||||
public:
|
||||
RestoreOp(const std::string& type, const framework::VariableNameMap& inputs,
|
||||
const framework::VariableNameMap& outputs,
|
||||
const framework::AttributeMap& attrs)
|
||||
: OperatorBase(type, inputs, outputs, attrs) {}
|
||||
|
||||
void Run(const framework::Scope& scope,
|
||||
const platform::DeviceContext& dev_ctx) const override {
|
||||
const auto& var_names = this->Outputs("Out");
|
||||
for (const auto& name : var_names) {
|
||||
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
|
||||
"Can not find variable '%s' in the scope.", name);
|
||||
}
|
||||
std::string folder_path = this->Attr<std::string>("folderPath");
|
||||
PADDLE_ENFORCE(!folder_path.empty(),
|
||||
"'folderPath' of RestoreOp shouldn't be empty.");
|
||||
|
||||
VLOG(1) << "Try loading variables from folder: " << folder_path;
|
||||
|
||||
for (const auto& name : var_names) {
|
||||
std::string file_name = VarToFileName(folder_path, name);
|
||||
std::ifstream fin(file_name, std::ifstream::in);
|
||||
PADDLE_ENFORCE(fin.is_open(), "Fail to open file %s.", file_name);
|
||||
const size_t kBufferSize = 4096; // equal to linux page size
|
||||
char buffer[kBufferSize];
|
||||
std::string cache;
|
||||
while (!fin.eof()) {
|
||||
fin.read(buffer, kBufferSize);
|
||||
cache.append(buffer, fin.gcount());
|
||||
}
|
||||
LoDTensor* tensor = scope.FindVar(name)->GetMutable<LoDTensor>();
|
||||
tensor->DeserializeFromString(cache, dev_ctx.GetPlace());
|
||||
fin.close();
|
||||
}
|
||||
VLOG(1) << "Complete loading variables.";
|
||||
}
|
||||
};
|
||||
|
||||
// Declares the proto interface of the "restore" operator: a duplicable
// output list "Out", a required "folderPath" string attribute, and a
// "data_type" int attribute defaulting to FP32.
class RestoreOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  RestoreOpMaker(framework::OpProto* proto,
                 framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // "Out" is duplicable: the op restores any number of tensors.
    // Fixed grammar in the user-facing doc string ("restores" -> "restored").
    AddOutput("Out",
              "(tensor), the tensor count can be 1~INT_MAX, tensors which "
              "values will be restored.")
        .AsDuplicable();
    AddAttr<std::string>("folderPath", "the folderPath for model file.");
    AddAttr<int>("data_type", "output tensor data type")
        .SetDefault(framework::DataType::FP32);
    AddComment(R"DOC(
Restore the tensors from model file based on absolute path.

All the tensors outputs may carry the LoD (Level of Details) information,
or not.
)DOC");
  }
};
|
||||
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
||||
|
||||
// Register both operators with EmptyGradOpMaker: no gradient op is
// generated, since saving/restoring parameters is not differentiable.
REGISTER_OPERATOR(save, paddle::operators::SaveOp,
                  paddle::framework::EmptyGradOpMaker,
                  paddle::operators::SaveOpMaker);

REGISTER_OPERATOR(restore, paddle::operators::RestoreOp,
                  paddle::framework::EmptyGradOpMaker,
                  paddle::operators::RestoreOpMaker);
|
@ -0,0 +1,71 @@
|
||||
import paddle.v2.framework.core as core
|
||||
import paddle.v2.framework.framework as framework
|
||||
import paddle.v2.framework.executor as executor
|
||||
|
||||
import numpy as np
|
||||
import unittest
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
|
||||
FOLDER_PATH = "./tmp_test_dir"
|
||||
|
||||
|
||||
class TestSaveRestoreOp(unittest.TestCase):
    # End-to-end test: save two tensors, clobber them, restore, and verify
    # the restored values match the originals.
    def test_save_restore_op(self):
        tensor_1_val = np.random.rand(3, 9).astype("float32")
        tensor_2_val = np.random.randint(0, 20, size=(4, 2)).astype("int32")
        place = core.CPUPlace()

        program = framework.Program()
        block = program.global_block()
        v_a = block.create_var(
            dtype="float32", shape=[3, 9], lod_level=0, name="tensor_1")
        v_b = block.create_var(
            dtype="int32", shape=[4, 2], lod_level=0, name="tensor_2")

        t_1 = core.LoDTensor()
        t_1.set(tensor_1_val, place)
        t_2 = core.LoDTensor()
        t_2.set(tensor_2_val, place)
        block.append_op(
            type="save",
            inputs={"X": [v_a, v_b]},
            attrs={"folderPath": FOLDER_PATH})
        # Overwrite both variables so a passing check can only come from the
        # restore op, not from the original values lingering in the scope.
        block.append_op(
            type="fill_constant",
            outputs={"Out": [v_a]},
            attrs={"shape": [2, 2],
                   "value": 0.0})
        block.append_op(
            type="fill_constant",
            outputs={"Out": [v_b]},
            attrs={"shape": [2, 2],
                   "value": 0.0})
        block.append_op(
            type="restore",
            outputs={"Out": [v_a, v_b]},
            attrs={"folderPath": FOLDER_PATH})

        # Start from a clean scratch directory.
        if os.path.exists(FOLDER_PATH):
            shutil.rmtree(FOLDER_PATH)
        os.makedirs(FOLDER_PATH)

        try:
            exe = executor.Executor(place)
            out = exe.run(program,
                          feed={"tensor_1": t_1,
                                "tensor_2": t_2},
                          fetch_list=[v_a, v_b])

            self.assertTrue(os.path.isdir(FOLDER_PATH))
            self.assertTrue(os.path.isfile(FOLDER_PATH + "/__tensor_1__"))
            self.assertTrue(os.path.isfile(FOLDER_PATH + "/__tensor_2__"))

            self.assertTrue(np.array_equal(np.array(out[0]), tensor_1_val))
            self.assertTrue(np.array_equal(np.array(out[1]), tensor_2_val))
        finally:
            # Always remove the scratch directory — even when the run or an
            # assertion fails — so reruns start from a clean state.
            shutil.rmtree(FOLDER_PATH)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
Loading…
Reference in new issue