Add variant of new load and save ops for storing model params in a single file (#7909)
* Add save_combine_op
* Add load_combine_op and test
* Add unit-test
* Add a delete to free buffer memory
* Add new variant of load/save
* Fix unit-test
* Add another unit test for compatibility with original save/load
* Address review comments and simplify logic
* Address review comments and simplify code - part 2
* Fix naming issues and CMake problems
* Address review comments
* Fix LoD information in tests
* Address review comments: round 2
parent fbd5f689bd
commit 2e907c3613
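For orientation, a minimal usage sketch of the two new ops, mirroring the unit test added in this change. The variable names, the file path, and the pre-existing `scope`/`place` objects are assumptions for illustration, not part of the patch:

// Sketch only: "w"/"b" are hypothetical scope variables holding LoDTensors,
// and "params.combined" is an illustrative file path.
paddle::framework::AttributeMap attrs;
attrs.insert({"file_path", std::string("params.combined")});

// Write both tensors back-to-back into a single file.
auto save = paddle::framework::OpRegistry::CreateOp(
    "save_combine", {{"X", {"w", "b"}}}, {}, attrs);
save->Run(scope, place);

// Read them back, in the same order, into pre-created output variables.
auto load = paddle::framework::OpRegistry::CreateOp(
    "load_combine", {}, {{"Out", {"w_out", "b_out"}}}, attrs);
load->Run(scope, place);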
@@ -0,0 +1,108 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <fstream>

#include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"

namespace paddle {
namespace operators {

class LoadCombineOp : public framework::OperatorBase {
 public:
  LoadCombineOp(const std::string &type,
                const framework::VariableNameMap &inputs,
                const framework::VariableNameMap &outputs,
                const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
           const platform::Place &place) const override {
    auto filename = Attr<std::string>("file_path");

    std::ifstream fin(filename);
    PADDLE_ENFORCE(static_cast<bool>(fin),
                   "Cannot open file %s for load_combine op", filename);

    auto out_var_names = Outputs("Out");
    PADDLE_ENFORCE_GT(
        static_cast<int>(out_var_names.size()), 0,
        "The number of output variables should be greater than 0.");

    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(place);

    for (size_t i = 0; i < out_var_names.size(); i++) {
      auto *out_var = scope.FindVar(out_var_names[i]);

      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
                     out_var_names[i]);

      auto *tensor = out_var->GetMutable<framework::LoDTensor>();

      // Error checking
      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
                     filename);

      // Get data from fin to tensor
      DeserializeFromStream(fin, tensor, dev_ctx);

      if (platform::is_gpu_place(place)) {
        // copy CPU to GPU
        framework::LoDTensor cpu_tensor;
        cpu_tensor.ShareDataWith(*tensor);
        cpu_tensor.set_lod(tensor->lod());

        // reset tensor
        out_var->Clear();
        tensor = out_var->GetMutable<framework::LoDTensor>();
        tensor->set_lod(cpu_tensor.lod());
        Copy(cpu_tensor, place, dev_ctx, tensor);
      }
    }
  }
};

class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput(
        "Out",
        "(vector) The output LoDTensors that will be read from the input file.")
        .AsDuplicable();
    AddAttr<std::string>("file_path",
                         "(string) "
                         "LoDTensors will be loaded from \"file_path\".")
        .AddCustomChecker(
            [](const std::string &path) { return !path.empty(); });
    AddComment(R"DOC(
LoadCombine Operator.

LoadCombine operator loads LoDTensor variables from a file. The file should
contain one or more LoDTensors serialized using the SaveCombine operator. The
LoadCombine operator applies a deserialization strategy to appropriately load
the LoDTensors, and this strategy complements the serialization strategy used
in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
with the SaveCombine operator, and can only deserialize one or more LoDTensors
that were saved using the SaveCombine operator.

)DOC");
  }
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
                  ops::LoadCombineOpProtoMaker);
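As the DOC block above notes, load_combine is positionally coupled to save_combine: tensors are read back with DeserializeFromStream in exactly the order they were written, so the names in "Out" must line up with the names in "X" at save time. A rough sketch of that per-output step outside the op; the file path, the CPU-only setup, and the function below are assumptions for illustration:

// Sketch of what LoadCombineOp::Run does for each name in "Out"; not part of
// the patch. Assumes a CPU device context and a file written by save_combine.
#include <fstream>
#include "paddle/framework/lod_tensor.h"
#include "paddle/platform/device_context.h"

void ReadTwoTensors() {
  paddle::platform::CPUDeviceContext dev_ctx;
  std::ifstream fin("params.combined");  // hypothetical path
  paddle::framework::LoDTensor t0, t1;
  // The i-th read yields the i-th tensor listed in "X" when the file was
  // saved; asking for more tensors than the file holds trips the
  // "Cannot read more from file" enforce inside LoadCombineOp::Run.
  paddle::framework::DeserializeFromStream(fin, &t0, dev_ctx);
  paddle::framework::DeserializeFromStream(fin, &t1, dev_ctx);
}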
@@ -0,0 +1,141 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <numeric>
#include <sstream>
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"

namespace paddle {
namespace operators {

// TODO(sidgoyal78): These functions are needed by other files (save_op), move
// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
constexpr char kSEP = '/';
static bool FileExists(const std::string &filepath) {
  struct stat buffer;
  return (stat(filepath.c_str(), &buffer) == 0);
}

static std::string DirName(const std::string &filepath) {
  auto pos = filepath.rfind(kSEP);
  if (pos == std::string::npos) {
    return "";
  }
  return filepath.substr(0, pos);
}

static void MkDir(const char *path) {
  if (mkdir(path, 0755)) {
    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
  }
}

static void MkDirRecursively(const char *fullpath) {
  if (*fullpath == '\0') return;  // empty string
  if (FileExists(fullpath)) return;

  MkDirRecursively(DirName(fullpath).c_str());
  MkDir(fullpath);
}

class SaveCombineOp : public framework::OperatorBase {
 public:
  SaveCombineOp(const std::string &type,
                const framework::VariableNameMap &inputs,
                const framework::VariableNameMap &outputs,
                const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
           const platform::Place &place) const override {
    auto filename = Attr<std::string>("file_path");
    auto overwrite = Attr<bool>("overwrite");

    bool is_present = FileExists(filename);
    if (is_present && !overwrite) {
      PADDLE_THROW("%s exists! Cannot save_combine to it when overwrite=false",
                   filename);
    }

    MkDirRecursively(DirName(filename).c_str());
    std::ofstream fout(filename);
    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
                   filename);

    auto inp_var_names = Inputs("X");
    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
                      "The number of input variables should be greater than 0");

    // get device context from pool
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(place);

    for (size_t i = 0; i < inp_var_names.size(); i++) {
      auto *var = scope.FindVar(inp_var_names[i]);

      PADDLE_ENFORCE(var != nullptr,
                     "Cannot find variable %s for save_combine_op",
                     inp_var_names[i]);
      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
                     inp_var_names[i]);

      auto &tensor = var->Get<framework::LoDTensor>();
      // Serialize tensor
      framework::SerializeToStream(fout, tensor, dev_ctx);
    }
    fout.close();
  }
};

class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",
        "(vector) Input LoDTensors that need to be saved together in a file.")
        .AsDuplicable();
    AddComment(R"DOC(
SaveCombine operator.

This operator will serialize and write a list of input LoDTensor variables
to a file on disk.
)DOC");
    AddAttr<bool>("overwrite",
                  "(boolean, default true) "
                  "Overwrite the output file if it exists.")
        .SetDefault(true);
    AddAttr<std::string>(
        "file_path",
        "(string) "
        "The \"file_path\" where the LoDTensor variables will be saved.")
        .AddCustomChecker(
            [](const std::string &path) { return !path.empty(); });
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
                  ops::SaveCombineOpProtoMaker);
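Two behaviors of SaveCombineOp worth calling out from the code above: it creates any missing parent directories via MkDirRecursively before opening the output stream, and it refuses to replace an existing file when the "overwrite" attribute is false. A hedged sketch; the paths, variable names, and the surrounding `scope`/`place` objects are illustrative assumptions:

// Sketch only: "ckpt/params.combined" and "w"/"b" are hypothetical.
paddle::framework::AttributeMap attrs;
attrs.insert({"file_path", std::string("ckpt/params.combined")});
attrs.insert({"overwrite", false});  // throw instead of replacing an existing file

auto save = paddle::framework::OpRegistry::CreateOp(
    "save_combine", {{"X", {"w", "b"}}}, {}, attrs);
// Creates ckpt/ if needed; PADDLE_THROWs if the file already exists,
// because overwrite was set to false above.
save->Run(scope, place);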
@@ -0,0 +1,180 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <iostream>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/framework/op_registry.h"

USE_NO_KERNEL_OP(save_combine);
USE_NO_KERNEL_OP(load_combine);

int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
                            std::string var_name,
                            paddle::platform::CPUPlace& place,
                            paddle::framework::Scope& scope,
                            paddle::framework::LoD& expect_lod) {
  auto var = scope.Var(var_name);
  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
  tensor->Resize({x, y});
  expect_lod.resize(1);
  for (size_t i = 0; i < lod_info.size(); i++) {
    expect_lod[0].push_back(lod_info[i]);
  }
  tensor->set_lod(expect_lod);
  int* expect = tensor->mutable_data<int>(place);
  for (int64_t i = 0; i < tensor->numel(); ++i) {
    expect[i] = static_cast<int>(i);
  }
  return expect;
}

paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
    const std::string out_var_name, paddle::framework::Scope& scope) {
  auto load_var = scope.Var(out_var_name);
  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
  return target;
}

int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
                                 paddle::framework::Scope& scope,
                                 paddle::framework::LoD& actual_lod) {
  int* actual = target->data<int>();
  actual_lod = target->lod();
  return actual;
}

void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod,
                 paddle::framework::LoD actual_lod, const int& numel) {
  for (int64_t i = 0; i < numel; ++i) {
    EXPECT_EQ(expect[i], actual[i]);
  }
  EXPECT_EQ(expect_lod.size(), actual_lod.size());
  for (size_t i = 0; i < expect_lod.size(); ++i) {
    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
    }
  }
}

// Here, we create 4 LoDTensors and use save_combine_op to first save these
// in a single file. Then, we use load_combine_op to load these sequentially
TEST(SaveLoadCombineOp, CPU) {
  paddle::framework::Scope scope;
  paddle::platform::CPUPlace place;

  std::vector<int> lod1 = {0, 1, 2, 3, 10};
  int numel1 = 100;
  paddle::framework::LoD expect_lod1;
  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope,
                                        expect_lod1);

  std::vector<int> lod2 = {0, 2, 5, 10};
  int numel2 = 200;
  paddle::framework::LoD expect_lod2;
  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope,
                                        expect_lod2);

  std::vector<int> lod3 = {0, 2, 3, 20};
  int numel3 = 4000;
  paddle::framework::LoD expect_lod3;
  int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
                                        scope, expect_lod3);

  std::vector<int> lod4 = {0, 1, 20};
  int numel4 = 1000;
  paddle::framework::LoD expect_lod4;
  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope,
                                        expect_lod4);

  // Set attributes
  std::string filename = "check_tensor.ls";
  paddle::framework::AttributeMap attrs;
  attrs.insert({"file_path", std::string(filename)});

  // Run the save_combine_op
  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
      "save_combine",
      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
  save_combine_op->Run(scope, place);

  // Set up output vars
  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);

  // Run the load_combine_op
  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
      "load_combine", {},
      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
  load_combine_op->Run(scope, place);

  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);

  CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
  CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
  CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3);
  CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4);
}

// Test with original SaveLoadTest
TEST(SaveLoadTestWithCombineOp, CPU) {
  paddle::framework::Scope scope;
  paddle::platform::CPUPlace place;

  auto var = scope.Var("test_var");
  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
  tensor->Resize({3, 10});
  paddle::framework::LoD expect_lod;
  expect_lod.resize(1);
  expect_lod[0].push_back(0);
  expect_lod[0].push_back(1);
  expect_lod[0].push_back(2);
  expect_lod[0].push_back(3);

  tensor->set_lod(expect_lod);
  int* expect = tensor->mutable_data<int>(place);
  for (int64_t i = 0; i < tensor->numel(); ++i) {
    expect[i] = static_cast<int>(i);
  }
  paddle::framework::AttributeMap attrs;
  attrs.insert({"file_path", std::string("check_t.save")});

  auto save_op = paddle::framework::OpRegistry::CreateOp(
      "save_combine", {{"X", {"test_var"}}}, {}, attrs);
  save_op->Run(scope, place);

  auto load_var = scope.Var("out_var");
  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
  auto load_op = paddle::framework::OpRegistry::CreateOp(
      "load_combine", {}, {{"Out", {"out_var"}}}, attrs);
  load_op->Run(scope, place);
  int* actual = target->data<int>();
  for (int64_t i = 0; i < tensor->numel(); ++i) {
    EXPECT_EQ(expect[i], actual[i]);
  }
  auto& actual_lod = target->lod();
  EXPECT_EQ(expect_lod.size(), actual_lod.size());
  for (size_t i = 0; i < expect_lod.size(); ++i) {
    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
    }
  }
}