Add support for INT8 matmul in C-API quantization (#23463)

* Integrate matmul with cpu_quantize_pass test=develop * Add matmul checking scales test=develop * Change condition of matmul quantization test=develop * Remove redundant var test=develop
5 years ago · ce08fdcf2b
parent c068512f34
commit ce08fdcf2b
6 changed files with 204 additions and 10 deletions
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@ -1349,6 +1349,27 @@ PDNode *patterns::Reshape::operator()() {
  return reshape_out;
 }
 // Builds the matmul pattern: two arbitrary producer ops, the variables they
 // feed into the "X" and "Y" inputs of a matmul op, the matmul op itself and
 // its "Out" variable. Returns the node describing the matmul output.
 PDNode *patterns::Matmul::operator()() {
  // Producers of the two operands; any op type is accepted here.
  auto *producer_x = pattern->NewNode(prev_op_x_repr())->assert_is_op();
  auto *producer_y = pattern->NewNode(prev_op_y_repr())->assert_is_op();

  auto *matmul = pattern->NewNode(matmul_op_repr())->assert_is_op("matmul");

  // Operand variables and the result variable of the matmul op.
  auto *operand_x = pattern->NewNode(matmul_in_x_repr())
                        ->AsInput()
                        ->assert_is_op_input("matmul", "X");
  auto *operand_y = pattern->NewNode(matmul_in_y_repr())
                        ->AsInput()
                        ->assert_is_op_input("matmul", "Y");
  auto *result = pattern->NewNode(matmul_out_repr())
                     ->AsOutput()
                     ->assert_is_op_output("matmul", "Out");

  // Wire the pattern: producer -> operand -> matmul -> result.
  producer_x->LinksTo({operand_x});
  producer_y->LinksTo({operand_y});
  matmul->LinksFrom({operand_x, operand_y});
  matmul->LinksTo({result});

  return result;
 }
 PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@ -835,6 +835,22 @@ struct Reshape : public PatternBase {
  PATTERN_DECL_NODE(next_op);
 };
 // Matmul op
 // Forward pass for matmul.
 // The pattern covers the ops producing both matmul inputs (prev_op_x,
 // prev_op_y), the input variables (matmul_in_x, matmul_in_y), the matmul op
 // and its output variable.
 // matmul_out is a result of the operator.
 struct Matmul : public PatternBase {
  // The third PatternBase argument labels this pattern; it must be "matmul"
  // (it was previously the copy-pasted "reshape2" label of the Reshape
  // pattern).
  Matmul(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "matmul") {}
  // Builds the pattern and returns the matmul_out node.
  PDNode* operator()();
  PATTERN_DECL_NODE(prev_op_x);
  PATTERN_DECL_NODE(prev_op_y);
  PATTERN_DECL_NODE(matmul_in_x);
  PATTERN_DECL_NODE(matmul_in_y);
  PATTERN_DECL_NODE(matmul_op);
  PATTERN_DECL_NODE(matmul_out);
 };
 // Concat op
 // Forward pass for concat.
 // concat_out is a result of the operator.
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@ -190,6 +190,16 @@ double CPUQuantizePass::GetScaleValueForNode(const Node* node,
  return scale_data.second.data<double>()[0];
 }
 // Tells whether `node` is an op that emits dequantized data: either an
 // explicit "dequantize" op or an op whose "use_quantizer" attribute is set.
 bool CPUQuantizePass::IsOpDequantized(const Node* node) const {
  if (node->Op()->Type() == "dequantize") {
    return true;
  }
  return node->Op()->GetAttrIfExists<bool>("use_quantizer");
 }
 // Tells whether `node` is an op that consumes quantized data: either an
 // explicit "quantize" op or an op whose "use_quantizer" attribute is set.
 bool CPUQuantizePass::IsOpQuantized(const Node* node) const {
  if (node->Op()->Type() == "quantize") {
    return true;
  }
  return node->Op()->GetAttrIfExists<bool>("use_quantizer");
 }
 void CPUQuantizePass::QuantizeConv(Graph* graph,
                                   bool with_residual_data) const {
  GraphPatternDetector gpd;
@ -449,11 +459,8 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
    GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, transpose_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, transpose_pattern);
-    // skip if prev op and next op are not quantized
+    // skip if prev op and next op is not quantized
-    if (!(prev_op->Op()->Type() == "dequantize" ||
+    if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(next_op))) {
          (prev_op->Op()->GetAttrIfExists<bool>("use_quantizer"))) &&
        !(next_op->Op()->Type() == "quantize" ||
          (next_op->Op()->GetAttrIfExists<bool>("use_quantizer")))) {
      return;
    }
    GET_IR_NODE_FROM_SUBGRAPH(transpose_in, transpose_in, transpose_pattern);
@ -500,11 +507,8 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
    GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, reshape_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, reshape_pattern);
-    // skip if prev op  and next op is not quantized
+    // skip if prev op and next op is not quantized
-    if (!(prev_op->Op()->Type() == "dequantize" ||
+    if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(next_op))) {
          (prev_op->Op()->GetAttrIfExists<bool>("use_quantizer"))) &&
        !(next_op->Op()->Type() == "quantize" ||
          (next_op->Op()->GetAttrIfExists<bool>("use_quantizer")))) {
      return;
    }
@ -530,6 +534,59 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
  PrettyLogDetail("---    quantized %d reshape ops", quantize_reshape_count);
 }
 // Quantizes every matmul op matched by patterns::Matmul that is marked with
 // "use_quantizer" and whose both inputs come from dequantized ops. For each
 // such op a quantize step is inserted on inputs X and Y and a dequantize step
 // on output Out; the scales are stored under the "Scale_x", "Scale_y" and
 // "Scale_out" attribute names.
 void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
  GraphPatternDetector gpd;
  patterns::Matmul matmul_pattern{gpd.mutable_pattern(), name_scope_};
  matmul_pattern();

  int quantize_matmul_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "Quantize matmul op";
    GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern);

    // Only ops explicitly marked for quantization are touched.
    const bool marked_for_quantization =
        matmul_op->Op()->GetAttrIfExists<bool>("use_quantizer");
    if (!marked_for_quantization) {
      return;
    }

    GET_IR_NODE_FROM_SUBGRAPH(prev_op_x, prev_op_x, matmul_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(prev_op_y, prev_op_y, matmul_pattern);

    // Both producers have to be dequantized, otherwise leave matmul as is.
    if (!(IsOpDequantized(prev_op_x) && IsOpDequantized(prev_op_y))) {
      return;
    }

    GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern);

    // Fetch the input scales together with their signedness flags.
    bool is_x_unsigned = false;
    bool is_y_unsigned = false;
    const double input_x_scale =
        GetScaleValueForNode(matmul_in_x, &is_x_unsigned);
    const double input_y_scale =
        GetScaleValueForNode(matmul_in_y, &is_y_unsigned);

    // Mixed signed/unsigned inputs are rejected.
    PADDLE_ENFORCE_EQ(
        is_x_unsigned, is_y_unsigned,
        platform::errors::InvalidArgument(
            "Matmul inputs should have the same value of is_unsigned"));

    QuantizeInput(g, matmul_op, matmul_in_x, "X", input_x_scale, is_x_unsigned,
                  "Scale_x");
    QuantizeInput(g, matmul_op, matmul_in_y, "Y", input_y_scale, is_y_unsigned,
                  "Scale_y");

    // The output scale is read only after both inputs were handled.
    bool is_output_unsigned = false;
    const double output_scale =
        GetScaleValueForNode(matmul_out, &is_output_unsigned);
    DequantizeOutput(g, matmul_op, matmul_out, "Out", output_scale,
                     is_output_unsigned, "Scale_out");

    ++quantize_matmul_count;
  };

  gpd(graph, handler);
  AddStatis(quantize_matmul_count);
  PrettyLogDetail("---    quantized %d matmul ops", quantize_matmul_count);
 }
 void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
  VLOG(3) << "Quantizing the graph.";
  PADDLE_ENFORCE(graph);
@ -545,6 +602,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
  QuantizeTranspose(graph);
  QuantizeFc(graph);
  QuantizeReshape(graph);
  QuantizeMatmul(graph);
 }
 }  // namespace ir
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
@ -58,6 +58,8 @@ class CPUQuantizePass : public FusePassBase {
  void QuantizeReshape(Graph* graph) const;
  void QuantizeMatmul(Graph* graph) const;
  void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
                     double scale_to_one, bool is_unsigned,
                     std::string scale_attr_name = "") const;
@ -76,6 +78,8 @@ class CPUQuantizePass : public FusePassBase {
  LoDTensor GetScaleTensorForNode(const Node* node) const;
  double GetScaleValueForNode(const Node* node,
                              bool* is_unsigned = nullptr) const;
  bool IsOpDequantized(const Node* node) const;
  bool IsOpQuantized(const Node* node) const;
  const std::string name_scope_{"quantize"};
 };
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@ -74,6 +74,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
    op->SetInput("Input", {inputs[0]});
    op->SetOutput("Output", {outputs[0]});
    op->SetAttr("Scale", 1.0f);
  } else if (type == "matmul") {
    op->SetInput("X", {inputs[0]});
    if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
    op->SetOutput("Out", {outputs[0]});
    op->SetAttr("use_quantizer", use_quantizer);
    op->SetAttr("Scale_x", 1.0f);
    op->SetAttr("Scale_y", 1.0f);
    op->SetAttr("Scale_out", 1.0f);
  }
 }
@ -513,6 +521,89 @@ TEST(CPUQuantizePass, check_scales) {
  MainTestCheckScales(BuildProgramDescCheckScalesConv(), var_names, "a");
 }
 // Variable names handed to PreparePass by MainTestMatmul; they match the
 // inputs and outputs of the ops created in the matmul test programs.
 static const std::initializer_list<std::string> variable_names_matmul = {
    "a", "b", "c", "d", "e", "f"};
 // Builds a program in which both matmul inputs are produced by dequantize
 // ops and the matmul op is marked with use_quantizer:
 //   a -> Dequantize1 -> b --\
 //                            Matmul -> e -> Dropout -> f
 //   c -> Dequantize2 -> d --/
 ProgramDesc BuildProgramDescMatmul() {
  ProgramDesc prog;
  // Declare the matmul test variables (previously this iterated the
  // transpose test's list by mistake).
  for (auto& v : variable_names_matmul) {
    prog.MutableBlock(0)->Var(v);
  }
  SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
  SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
  return prog;
 }
 // Builds a program in which only the Y input of matmul comes from a
 // dequantize op; the X input is produced by a dropout op:
 //   a -> Dropout    -> b --\
 //                           Matmul -> e -> Dropout -> f
 //   c -> Dequantize -> d --/
 ProgramDesc BuildProgramDescMatmulNotQuantized() {
  ProgramDesc prog;
  // Declare the matmul test variables (previously this iterated the
  // transpose test's list by mistake).
  for (auto& v : variable_names_matmul) {
    prog.MutableBlock(0)->Var(v);
  }
  SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, false);
  SetOp(&prog, "dequantize", "Dequantize", {"c"}, {"d"}, true);
  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
  return prog;
 }
 // Runs the pass (via PreparePass) on `prog` and verifies the resulting graph:
 // the number of matmul / quantize / dequantize ops, the Scale_x, Scale_y and
 // Scale_out attributes of every matmul op, and the number of nodes added to
 // the graph.
 void MainTestMatmul(const ProgramDesc& prog, int matmul_count, int quant_count,
                    int dequant_count, int added_nodes_count, float scale) {
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  int original_nodes_num, current_nodes_num;
  PreparePass(&graph, prog, variable_names_matmul, &original_nodes_num,
              &current_nodes_num);

  int matmul_nodes_count = 0;
  int quantize_nodes_count = 0;
  int dequantize_nodes_count = 0;
  for (auto* node : graph->Nodes()) {
    // Variable nodes are of no interest here.
    if (!node->IsOp()) {
      continue;
    }
    auto* op = node->Op();
    if (op->Type() == "matmul") {
      ++matmul_nodes_count;
      // Every matmul op has to carry the expected scales.
      auto op_name = boost::get<std::string>(op->GetAttr("name"));
      EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_x")), scale)
          << "Scale_x for node '" + op_name + "'.";
      EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_y")), scale)
          << "Scale_y for node '" + op_name + "'.";
      EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_out")), scale)
          << "Scale_out for node '" + op_name + "'.";
    } else if (op->Type() == "quantize") {
      ++quantize_nodes_count;
    } else if (op->Type() == "dequantize") {
      ++dequantize_nodes_count;
    }
  }

  EXPECT_EQ(matmul_nodes_count, matmul_count);
  EXPECT_EQ(quantize_nodes_count, quant_count);
  EXPECT_EQ(dequantize_nodes_count, dequant_count);
  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
 }
 // Matmul with both inputs dequantized: expects one matmul, two quantize and
 // three dequantize ops after the pass.
 TEST(CpuQuantizePass, matmul) {
  // Added nodes: 2 Quant + 2 IN + 1 DeQuant + 1 OUT
  MainTestMatmul(BuildProgramDescMatmul(), /*matmul_count=*/1,
                 /*quant_count=*/2, /*dequant_count=*/3,
                 /*added_nodes_count=*/6, /*scale=*/2.0f * 127);
 }
 // Matmul with one input not dequantized: the graph must stay unchanged, so
 // no quantize op appears, the single dequantize op remains and no nodes are
 // added.
 TEST(CpuQuantizePass, matmul_not_quantized) {
  MainTestMatmul(BuildProgramDescMatmulNotQuantized(), /*matmul_count=*/1,
                 /*quant_count=*/0, /*dequant_count=*/1,
                 /*added_nodes_count=*/0, /*scale=*/1.0f);
 }
 }  // namespace
 }  // namespace ir
--- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
@ -45,6 +45,10 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
  rules_["fc"]["Bias"] = ScaleAlgo::NONE;
  rules_["fc"]["Out"] = ScaleAlgo::KL;
  rules_["matmul"]["X"] = ScaleAlgo::KL;
  rules_["matmul"]["Y"] = ScaleAlgo::KL;
  rules_["matmul"]["Out"] = ScaleAlgo::KL;
  // Reshape2 does not perform calculation on the data and shapes are not
  // changed. Scale is calculated on input data and is assigned to both the
  // Quantize and Dequantize scale.