Optimize the CUDA implementation of sequence_expand op by reduce the times of copying lod data from CPU to GPU. (#15493)

* Optimize the CUDA implementation of sequence_expand op by reduce the times of copying lod data from CPU to GPU. test=develop * Refine the op benchmark to support setting lod in config. test=develop
6 years ago · f4634d76d7
parent 60546b78cc
commit f4634d76d7
5 changed files with 214 additions and 37 deletions
--- a/paddle/fluid/operators/benchmark/op_tester.cc
+++ b/paddle/fluid/operators/benchmark/op_tester.cc
@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/benchmark/op_tester.h"
+#include <fstream>
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_info.h"
@ -28,6 +29,7 @@ namespace operators {
 namespace benchmark {

 DEFINE_string(op_config_list, "", "Path of op config file.");
+DEFINE_int32(specified_config_id, -1, "Test the specified op config.");

 void OpTester::Init(const std::string &filename) {
  Init(OpTesterConfig(filename));
@ -147,7 +149,7 @@ void OpTester::CreateInputVarDesc() {
    var->SetShape(input->dims);

    op_desc_.SetInput(name, {var_name});
-    inputs_.push_back(var_name);
+    input_lods_[var_name] = input->lod;
  }
 }

@ -162,7 +164,6 @@ void OpTester::CreateOutputVarDesc() {
    var->SetDataType(framework::proto::VarType::FP32);

    op_desc_.SetOutput(name, {var_name});
-    outputs_.push_back(var_name);
  }
 }

@ -218,16 +219,26 @@ void OpTester::CreateVariables(framework::Scope *scope) {
    }
  }

-  // Allocate memory for input tensor
-  for (auto &name : inputs_) {
-    VLOG(3) << "Allocate memory for tensor " << name;
-    auto &var_desc = vars_[name];
+  for (auto &item : input_lods_) {
+    // Allocate memory for input tensor
+    auto &var_name = item.first;
+    VLOG(3) << "Allocate memory for tensor " << var_name;
+
+    auto &var_desc = vars_[var_name];
    std::vector<int64_t> shape = var_desc->GetShape();

-    auto *var = scope->Var(name);
+    auto *var = scope->Var(var_name);
    auto *tensor = var->GetMutable<framework::LoDTensor>();
    SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
                       static_cast<float>(1.0));
+
+    VLOG(3) << "Set lod for tensor " << var_name;
+    std::vector<std::vector<size_t>> &lod_vec = item.second;
+    framework::LoD lod;
+    for (size_t i = 0; i < lod_vec.size(); ++i) {
+      lod.push_back(lod_vec[i]);
+    }
+    tensor->set_lod(lod);
  }
 }

@ -282,10 +293,32 @@ std::string OpTester::DebugString() {
 }

 TEST(op_tester, base) {
-  OpTester tester;
  if (!FLAGS_op_config_list.empty()) {
-    tester.Init(FLAGS_op_config_list);
+    std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
+                   FLAGS_op_config_list.c_str());
+    std::vector<OpTesterConfig> op_configs;
+    while (!fin.eof()) {
+      OpTesterConfig config;
+      bool result = config.Init(fin);
+      if (result) {
+        op_configs.push_back(config);
+      }
+    }
+    if (FLAGS_specified_config_id >= 0 &&
+        FLAGS_specified_config_id < static_cast<int>(op_configs.size())) {
+      OpTester tester;
+      tester.Init(op_configs[FLAGS_specified_config_id]);
+      tester.Run();
+    } else {
+      for (size_t i = 0; i < op_configs.size(); ++i) {
+        OpTester tester;
+        tester.Init(op_configs[i]);
+        tester.Run();
+      }
+    }
  } else {
+    OpTester tester;
    OpTesterConfig config;
    config.op_type = "elementwise_add";
    config.inputs.resize(2);
@ -294,8 +327,8 @@ TEST(op_tester, base) {
    config.inputs[1].name = "Y";
    config.inputs[1].dims = {64, 1};
    tester.Init(config);
+    tester.Run();
  }
-  tester.Run();
 }

 }  // namespace benchmark
--- a/paddle/fluid/operators/benchmark/op_tester.h
+++ b/paddle/fluid/operators/benchmark/op_tester.h
@ -57,8 +57,7 @@ class OpTester {
  std::string type_;
  framework::OpDesc op_desc_;
  std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
-  std::vector<std::string> inputs_;
-  std::vector<std::string> outputs_;
+  std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
  std::unique_ptr<framework::OperatorBase> op_;
  platform::Place place_;
  std::unique_ptr<framework::Scope> scope_;
--- a/paddle/fluid/operators/benchmark/op_tester_config.cc
+++ b/paddle/fluid/operators/benchmark/op_tester_config.cc
@ -33,21 +33,64 @@ static bool EndWith(const std::string& str, const std::string& substr) {
  return str.rfind(substr) == (str.length() - substr.length());
 }

-static void EraseEndSep(std::string* str) {
-  std::string substr = kSepBetweenItems;
+static void EraseEndSep(std::string* str,
+                        std::string substr = kSepBetweenItems) {
  if (EndWith(*str, substr)) {
    str->erase(str->length() - substr.length(), str->length());
  }
 }

-static std::vector<int64_t> ParseDims(std::string dims_str) {
-  std::vector<int64_t> dims;
+void OpInputConfig::ParseDims(std::istream& is) {
+  std::string dims_str;
+  is >> dims_str;
+
+  dims.clear();
  std::string token;
  std::istringstream token_stream(dims_str);
  while (std::getline(token_stream, token, 'x')) {
    dims.push_back(std::stoi(token));
  }
-  return dims;
+}
+
+void OpInputConfig::ParseLoD(std::istream& is) {
+  std::string lod_str;
+  std::string start_sep =
+      std::string(kStartSeparator) + std::string(kStartSeparator);
+  std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator);
+
+  std::string sep;
+  is >> sep;
+  if (StartWith(sep, start_sep)) {
+    lod_str += sep;
+    while (!EndWith(sep, end_sep)) {
+      is >> sep;
+      lod_str += sep;
+    }
+  }
+  EraseEndSep(&lod_str);
+  PADDLE_ENFORCE_GE(lod_str.length(), 4U);
+  VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
+
+  // Parse the lod_str
+  lod.clear();
+  for (size_t i = 1; i < lod_str.length() - 1;) {
+    if (lod_str[i] == '{') {
+      std::vector<size_t> level;
+      while (lod_str[i] != '}') {
+        ++i;
+
+        std::string number;
+        while (lod_str[i] >= '0' && lod_str[i] <= '9') {
+          number += lod_str[i];
+          ++i;
+        }
+        level.push_back(atoi(number.c_str()));
+      }
+      lod.push_back(level);
+    } else if (lod_str[i] == '}') {
+      ++i;
+    }
+  }
 }

 OpInputConfig::OpInputConfig(std::istream& is) {
@ -60,9 +103,9 @@ OpInputConfig::OpInputConfig(std::istream& is) {
        is >> name;
        EraseEndSep(&name);
      } else if (sep == "dims" || sep == "dims:") {
-        std::string dims_str;
-        is >> dims_str;
-        dims = ParseDims(dims_str);
+        ParseDims(is);
+      } else if (sep == "lod" || sep == "lod:") {
+        ParseLoD(is);
      }
    }
  }
@ -76,7 +119,7 @@ OpTesterConfig::OpTesterConfig(const std::string& filename) {
  Init(fin);
 }

-void OpTesterConfig::Init(std::istream& is) {
+bool OpTesterConfig::Init(std::istream& is) {
  std::string sep;
  is >> sep;
  if (sep == kStartSeparator) {
@ -95,9 +138,40 @@ void OpTesterConfig::Init(std::istream& is) {
      } else if (sep == "input" || sep == "input:") {
        OpInputConfig input_config(is);
        inputs.push_back(input_config);
+      } else if (sep == "attrs" || sep == "attrs:") {
+        ParseAttrs(is);
+      } else {
+        if (sep != kEndSeparator) {
+          return false;
+        }
      }
    }
+  } else {
+    return false;
+  }
+  return true;
+}
+
+bool OpTesterConfig::ParseAttrs(std::istream& is) {
+  std::string sep;
+  is >> sep;
+  if (sep == kStartSeparator) {
+    while (true) {
+      std::string key;
+      is >> key;
+      if (key == kEndSeparator) {
+        break;
+      }
+
+      std::string value;
+      is >> value;
+      EraseEndSep(&key, ":");
+      EraseEndSep(&value);
+
+      attrs[key] = value;
+    }
  }
+  return true;
 }

 const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
--- a/paddle/fluid/operators/benchmark/op_tester_config.h
+++ b/paddle/fluid/operators/benchmark/op_tester_config.h
@ -16,6 +16,7 @@ limitations under the License. */

 #include <istream>
 #include <string>
+#include <unordered_map>
 #include <vector>

 namespace paddle {
@ -26,19 +27,27 @@ struct OpInputConfig {
  OpInputConfig() {}
  explicit OpInputConfig(std::istream& is);

+  void ParseDims(std::istream& is);
+  void ParseLoD(std::istream& is);
+
  std::string name;
  std::vector<int64_t> dims;
+  std::vector<std::vector<size_t>> lod;
 };

 struct OpTesterConfig {
  OpTesterConfig() {}
  explicit OpTesterConfig(const std::string& filename);
-  void Init(std::istream& is);
+
+  bool Init(std::istream& is);
+
+  bool ParseAttrs(std::istream& is);

  const OpInputConfig* GetInput(const std::string& name);

  std::string op_type;
  std::vector<OpInputConfig> inputs;
+  std::unordered_map<std::string, std::string> attrs;
  int device_id{-1};  // CPU: -1
  int repeat{1};
  int profile{0};
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <algorithm>
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"

@ -88,6 +89,49 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod,
  }
 }

+template <typename T>
+static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context,
+                              const LoDTensor& x, LoDTensor* out,
+                              const framework::Vector<size_t>& x_lod,
+                              const framework::Vector<size_t>& ref_lod,
+                              bool do_copy) {
+  auto out_data = out->data<T>();
+  auto x_data = x.data<T>();
+
+  auto& gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
+
+  int x_item_length = x.numel() / x.dims()[0];
+  int out_offset = 0;
+  int num_copys = 0;
+  for (size_t i = 1; i < ref_lod.size(); ++i) {
+    int repeat_num = ref_lod[i] - ref_lod[i - 1];
+    int x_start = x_lod[i - 1];
+    int x_end = x_lod[i];
+    int x_seq_len = x_end - x_start;
+    if (repeat_num > 0) {
+      if (do_copy) {
+        int out_start = out_offset;
+        if (out->lod().size() == 1) {
+          out_start = out->lod()[0][out_offset];
+        }
+        for (int j = 0; j < repeat_num; j++) {
+          for (int k = 0; k < x_seq_len; k++) {
+            memory::Copy(
+                gpu_place,
+                out_data + (out_start + j * x_seq_len + k) * x_item_length,
+                gpu_place, x_data + (x_start + k) * x_item_length,
+                sizeof(T) * x_item_length, context.stream());
+          }
+        }
+      } else {
+        num_copys += repeat_num * x_seq_len;
+      }
+    }
+    out_offset += repeat_num;
+  }
+  return num_copys;
+}
+
 template <typename T>
 struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
  void operator()(
@ -95,22 +139,40 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
      LoDTensor* out) {
-    int x_item_length = x.numel() / x.dims()[0];
-    framework::Vector<size_t> out_offset(x_lod.size());
-    GetOutputOffset(x_lod, ref_lod, &out_offset);
-
-    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
-    int thread_y = 16;
-    int thread_z = 1024 / thread_x / thread_y;
-    int block_x = static_cast<int>(ref_lod.size());
-    dim3 block_size(thread_x, thread_y, thread_z);
-    dim3 grid_size(block_x, 1);
+    int num_copys =
+        ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
+    // Sometimes direct copies will be faster, this maybe need deeply analysis.
+    if (num_copys < 5) {
+      ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, true);
+    } else {
+      int x_item_length = x.numel() / x.dims()[0];
+      size_t x_lod_size = x_lod.size();
+      framework::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
+      GetOutputOffset(x_lod, ref_lod, &out_offset);
+
+      for (size_t i = 0; i < x_lod_size; ++i) {
+        out_offset[x_lod_size + i] = x_lod[i];
+      }
+      for (size_t i = 0; i < ref_lod.size(); ++i) {
+        out_offset[2 * x_lod_size + i] = ref_lod[i];
+      }

-    sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
-        x.data<T>(), x_lod.CUDAData(context.GetPlace()),
-        ref_lod.CUDAData(context.GetPlace()),
-        out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
-        out->mutable_data<T>(context.GetPlace()));
+      const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace());
+      const size_t* x_lod_data = out_offset_data + x_lod_size;
+      const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;
+
+      int thread_x =
+          std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+      int thread_y = 16;
+      int thread_z = 1024 / thread_x / thread_y;
+      int block_x = static_cast<int>(ref_lod.size());
+      dim3 block_size(thread_x, thread_y, thread_z);
+      dim3 grid_size(block_x, 1);
+
+      sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+          x.data<T>(), x_lod_data, ref_lod_data, out_offset_data, x_lod_size,
+          x_item_length, out->mutable_data<T>(context.GetPlace()));
+    }
  }
 };