Add variant of new load and save ops for storing model params in a single file (#7909)
* Add save_combine_op
* Add load_combine_op and test
* Add unit-test
* Add a delete to free buffer memory
* Add new variant of load/save
* Fix unit-test
* Add another unit test for compatibility with original save/load
* Address review comments and simplify logic
* Address review comments and simplify code - part 2
* Fix naming issues and CMake problems
* Address review comments
* Fix LoD information in tests
* Address review comments: round 2
parent fbd5f689bd
commit 2e907c3613
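
Usage sketch (not part of the diff): the snippet below shows how the two new operators are meant to be used together — several parameter tensors are serialized into a single file by save_combine and read back, in the same order, by load_combine. It mirrors the unit test added in this PR; the variable names ("w", "b", "w_loaded", "b_loaded") and the file path are hypothetical.

#include "paddle/framework/op_registry.h"

USE_NO_KERNEL_OP(save_combine);
USE_NO_KERNEL_OP(load_combine);

void SaveAndReloadParams() {
  paddle::framework::Scope scope;
  paddle::platform::CPUPlace place;

  // Assume "w" and "b" already hold initialized LoDTensors in `scope`.
  paddle::framework::AttributeMap attrs;
  attrs.insert({"file_path", std::string("/tmp/params.combined")});  // hypothetical path

  // Serialize both parameters into one file.
  auto save = paddle::framework::OpRegistry::CreateOp(
      "save_combine", {{"X", {"w", "b"}}}, {}, attrs);
  save->Run(scope, place);

  // Later, deserialize them back, in the same order they were saved.
  scope.Var("w_loaded")->GetMutable<paddle::framework::LoDTensor>();
  scope.Var("b_loaded")->GetMutable<paddle::framework::LoDTensor>();
  auto load = paddle::framework::OpRegistry::CreateOp(
      "load_combine", {}, {{"Out", {"w_loaded", "b_loaded"}}}, attrs);
  load->Run(scope, place);
}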
@@ -0,0 +1,108 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <fstream>

#include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"

namespace paddle {
namespace operators {

class LoadCombineOp : public framework::OperatorBase {
 public:
  LoadCombineOp(const std::string &type,
                const framework::VariableNameMap &inputs,
                const framework::VariableNameMap &outputs,
                const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
           const platform::Place &place) const override {
    auto filename = Attr<std::string>("file_path");

    std::ifstream fin(filename);
    PADDLE_ENFORCE(static_cast<bool>(fin),
                   "Cannot open file %s for load_combine op", filename);

    auto out_var_names = Outputs("Out");
    PADDLE_ENFORCE_GT(
        static_cast<int>(out_var_names.size()), 0,
        "The number of output variables should be greater than 0.");

    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(place);

    for (size_t i = 0; i < out_var_names.size(); i++) {
      auto *out_var = scope.FindVar(out_var_names[i]);

      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
                     out_var_names[i]);

      auto *tensor = out_var->GetMutable<framework::LoDTensor>();

      // Error checking
      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
                     filename);

      // Get data from fin to tensor
      DeserializeFromStream(fin, tensor, dev_ctx);

      if (platform::is_gpu_place(place)) {
        // copy CPU to GPU
        framework::LoDTensor cpu_tensor;
        cpu_tensor.ShareDataWith(*tensor);
        cpu_tensor.set_lod(tensor->lod());

        // reset tensor
        out_var->Clear();
        tensor = out_var->GetMutable<framework::LoDTensor>();
        tensor->set_lod(cpu_tensor.lod());
        Copy(cpu_tensor, place, dev_ctx, tensor);
      }
    }
  }
};

class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput(
        "Out",
        "(vector) The output LoDTensors that will be read from the input file.")
        .AsDuplicable();
    AddAttr<std::string>("file_path",
                         "(string) "
                         "LoDTensors will be loaded from \"file_path\".")
        .AddCustomChecker(
            [](const std::string &path) { return !path.empty(); });
    AddComment(R"DOC(
LoadCombine Operator.

LoadCombine operator loads LoDTensor variables from a file. The file should
contain one or more LoDTensors serialized using the SaveCombine operator. The
LoadCombine operator applies a deserialization strategy to appropriately load
the LodTensors, and this strategy complements the serialization strategy used
in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
with the SaveCombine operator, and can only deserialize one or more LoDTensors
that were saved using the SaveCombine operator.

)DOC");
  }
};

}  // namespace operators
}  // namespace paddle
namespace ops = paddle::operators;

REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
                  ops::LoadCombineOpProtoMaker);
@@ -0,0 +1,141 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <numeric>
#include <sstream>
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"

namespace paddle {
namespace operators {

// TODO(sidgoyal78): These function are needed by other files (save_op), move
// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
constexpr char kSEP = '/';
static bool FileExists(const std::string &filepath) {
  struct stat buffer;
  return (stat(filepath.c_str(), &buffer) == 0);
}

static std::string DirName(const std::string &filepath) {
  auto pos = filepath.rfind(kSEP);
  if (pos == std::string::npos) {
    return "";
  }
  return filepath.substr(0, pos);
}

static void MkDir(const char *path) {
  if (mkdir(path, 0755)) {
    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
  }
}

static void MkDirRecursively(const char *fullpath) {
  if (*fullpath == '\0') return;  // empty string
  if (FileExists(fullpath)) return;

  MkDirRecursively(DirName(fullpath).c_str());
  MkDir(fullpath);
}

class SaveCombineOp : public framework::OperatorBase {
 public:
  SaveCombineOp(const std::string &type,
                const framework::VariableNameMap &inputs,
                const framework::VariableNameMap &outputs,
                const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
           const platform::Place &place) const override {
    auto filename = Attr<std::string>("file_path");
    auto overwrite = Attr<bool>("overwrite");

    bool is_present = FileExists(filename);
    if (is_present && !overwrite) {
      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
                   filename, overwrite);
    }

    MkDirRecursively(DirName(filename).c_str());
    std::ofstream fout(filename);
    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
                   filename);

    auto inp_var_names = Inputs("X");
    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
                      "The number of input variables should be greater than 0");

    // get device context from pool
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(place);

    for (size_t i = 0; i < inp_var_names.size(); i++) {
      auto *var = scope.FindVar(inp_var_names[i]);

      PADDLE_ENFORCE(var != nullptr,
                     "Cannot find variable %s for save_combine_op",
                     inp_var_names[i]);
      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
                     inp_var_names[i]);

      auto &tensor = var->Get<framework::LoDTensor>();
      // Serialize tensor
      framework::SerializeToStream(fout, tensor, dev_ctx);
    }
    fout.close();
  }
};

class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",
        "(vector) Input LoDTensors that need to be saved together in a file.")
        .AsDuplicable();
    AddComment(R"DOC(
SaveCombine operator

This operator will serialize and write a list of input LoDTensor variables
to a file on disk.
)DOC");
    AddAttr<bool>("overwrite",
                  "(boolean, default true)"
                  "Overwrite the output file if it exists.")
        .SetDefault(true);
    AddAttr<std::string>(
        "file_path",
        "(string)"
        "The \"file_path\" where the LoDTensor variables will be saved.")
        .AddCustomChecker(
            [](const std::string &path) { return !path.empty(); });
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
                  ops::SaveCombineOpProtoMaker);
@@ -0,0 +1,180 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <iostream>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/framework/op_registry.h"

USE_NO_KERNEL_OP(save_combine);
USE_NO_KERNEL_OP(load_combine);

int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
                            std::string var_name,
                            paddle::platform::CPUPlace& place,
                            paddle::framework::Scope& scope,
                            paddle::framework::LoD& expect_lod) {
  auto var = scope.Var(var_name);
  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
  tensor->Resize({x, y});
  expect_lod.resize(1);
  for (size_t i = 0; i < lod_info.size(); i++) {
    expect_lod[0].push_back(lod_info[i]);
  }
  tensor->set_lod(expect_lod);
  int* expect = tensor->mutable_data<int>(place);
  for (int64_t i = 0; i < tensor->numel(); ++i) {
    expect[i] = static_cast<int>(i);
  }
  return expect;
}

paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
    const std::string out_var_name, paddle::framework::Scope& scope) {
  auto load_var = scope.Var(out_var_name);
  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
  return target;
}

int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
                                 paddle::framework::Scope& scope,
                                 paddle::framework::LoD& actual_lod) {
  int* actual = target->data<int>();
  actual_lod = target->lod();
  return actual;
}

void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod,
                 paddle::framework::LoD actual_lod, const int& numel) {
  for (int64_t i = 0; i < numel; ++i) {
    EXPECT_EQ(expect[i], actual[i]);
  }
  EXPECT_EQ(expect_lod.size(), actual_lod.size());
  for (size_t i = 0; i < expect_lod.size(); ++i) {
    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
    }
  }
}

// Here, we create 4 LoDTensors and use save_combine_op to first save these
// in a single file. Then, we use load_combine_op to load these sequentially
TEST(SaveLoadCombineOp, CPU) {
  paddle::framework::Scope scope;
  paddle::platform::CPUPlace place;

  std::vector<int> lod1 = {0, 1, 2, 3, 10};
  int numel1 = 100;
  paddle::framework::LoD expect_lod1;
  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope,
                                        expect_lod1);

  std::vector<int> lod2 = {0, 2, 5, 10};
  int numel2 = 200;
  paddle::framework::LoD expect_lod2;
  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope,
                                        expect_lod2);

  std::vector<int> lod3 = {0, 2, 3, 20};
  int numel3 = 4000;
  paddle::framework::LoD expect_lod3;
  int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
                                        scope, expect_lod3);

  std::vector<int> lod4 = {0, 1, 20};
  int numel4 = 1000;
  paddle::framework::LoD expect_lod4;
  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope,
                                        expect_lod4);

  // Set attributes
  std::string filename = "check_tensor.ls";
  paddle::framework::AttributeMap attrs;
  attrs.insert({"file_path", std::string(filename)});

  // Run the save_combine_op
  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
      "save_combine",
      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
  save_combine_op->Run(scope, place);

  // Set up output vars
  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);

  // Run the load_combine_op
  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
      "load_combine", {},
      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
  load_combine_op->Run(scope, place);

  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);

  CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
  CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
  CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3);
  CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4);
}

// Test with original SaveLoadTest
TEST(SaveLoadTestWithCombineOp, CPU) {
  paddle::framework::Scope scope;
  paddle::platform::CPUPlace place;

  auto var = scope.Var("test_var");
  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
  tensor->Resize({3, 10});
  paddle::framework::LoD expect_lod;
  expect_lod.resize(1);
  expect_lod[0].push_back(0);
  expect_lod[0].push_back(1);
  expect_lod[0].push_back(2);
  expect_lod[0].push_back(3);

  tensor->set_lod(expect_lod);
  int* expect = tensor->mutable_data<int>(place);
  for (int64_t i = 0; i < tensor->numel(); ++i) {
    expect[i] = static_cast<int>(i);
  }
  paddle::framework::AttributeMap attrs;
  attrs.insert({"file_path", std::string("check_t.save")});

  auto save_op = paddle::framework::OpRegistry::CreateOp(
      "save_combine", {{"X", {"test_var"}}}, {}, attrs);
  save_op->Run(scope, place);

  auto load_var = scope.Var("out_var");
  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
  auto load_op = paddle::framework::OpRegistry::CreateOp(
      "load_combine", {}, {{"Out", {"out_var"}}}, attrs);
  load_op->Run(scope, place);
  int* actual = target->data<int>();
  for (int64_t i = 0; i < tensor->numel(); ++i) {
    EXPECT_EQ(expect[i], actual[i]);
  }
  auto& actual_lod = target->lod();
  EXPECT_EQ(expect_lod.size(), actual_lod.size());
  for (size_t i = 0; i < expect_lod.size(); ++i) {
    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
    }
  }
}