From d6abf2b531b3ea5688b3d61df8a450837cc8312e Mon Sep 17 00:00:00 2001
From: yangjie159
Date: Fri, 19 Feb 2021 15:34:43 +0800
Subject: [PATCH] fix bugs of micro, and add deconvolution int8 coder

---
 mindspore/lite/micro/CMakeLists.txt                |   2 +-
 mindspore/lite/micro/cmake/file_list.cmake         |   3 +
 .../component/benchmark_component.cc               |   2 +-
 .../generator/component/train_component.cc         |   2 +-
 .../coder/generator/train/train_generator.cc       |  14 +-
 .../nnacl/int8/deconvolution_int8_coder.cc         | 161 ++++++++++++++++++
 .../nnacl/int8/deconvolution_int8_coder.h          |  63 +++++++
 mindspore/lite/micro/coder/session.cc              |   3 +-
 8 files changed, 233 insertions(+), 17 deletions(-)
 create mode 100644 mindspore/lite/micro/coder/opcoders/nnacl/int8/deconvolution_int8_coder.cc
 create mode 100644 mindspore/lite/micro/coder/opcoders/nnacl/int8/deconvolution_int8_coder.h

diff --git a/mindspore/lite/micro/CMakeLists.txt b/mindspore/lite/micro/CMakeLists.txt
index b933d78f82..d49c00d88f 100644
--- a/mindspore/lite/micro/CMakeLists.txt
+++ b/mindspore/lite/micro/CMakeLists.txt
@@ -35,7 +35,7 @@ ms_build_flatbuffers_lite(FBS_FILES
 if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
     MESSAGE("******Micro Debug********")
     set(CMAKE_C_FLAGS "-Wall -Werror -ftrapv -DDebug -g -fvisibility=default ${CMAKE_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "-Wall -Werror-ftrapv -DDebug -g -fvisibility=default ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-Wall -Werror -ftrapv -DDebug -g -fvisibility=default ${CMAKE_CXX_FLAGS}")
 else()
     MESSAGE(" ******Micro Release********")
     set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -Wall -Werror \
diff --git a/mindspore/lite/micro/cmake/file_list.cmake b/mindspore/lite/micro/cmake/file_list.cmake
index e0842dd1d1..80b6f3d413 100644
--- a/mindspore/lite/micro/cmake/file_list.cmake
+++ b/mindspore/lite/micro/cmake/file_list.cmake
@@ -84,6 +84,7 @@ set(CODER_OPCODERS_SRC
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
+        ${MICRO_DIR}/coder/opcoders/nnacl/int8/deconvolution_int8_coder.cc
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/pooling_int8_coder.cc
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/reduce_int8_coder.cc
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/reshape_int8_coder.cc
@@ -129,6 +130,8 @@ set(LITE_KERNEL_SRC
         ${LITE_DIR}/nnacl/int8/conv3x3_int8.c
         ${LITE_DIR}/nnacl/int8/conv1x1_int8.c
         ${LITE_DIR}/nnacl/base/conv1x1_base.c
+        ${LITE_DIR}/nnacl/int8/deconv_int8.c
+        ${LITE_DIR}/nnacl/int8/common_func_int8.c
 )

 list(APPEND FILE_SET ${CODER_SRC} ${CODER_UTILS_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC}
diff --git a/mindspore/lite/micro/coder/generator/component/benchmark_component.cc b/mindspore/lite/micro/coder/generator/component/benchmark_component.cc
index 3b42e2a2bf..71ab14bfdd 100644
--- a/mindspore/lite/micro/coder/generator/component/benchmark_component.cc
+++ b/mindspore/lite/micro/coder/generator/component/benchmark_component.cc
@@ -184,7 +184,7 @@ void CodeBenchmarkFreeResourse(std::ofstream &ofs, const std::string &module_nam
   ofs << "  for (int i = 0; i < " << inputs_num << "; ++i) {\n";
   ofs << "    free(inputs_binbuf[i]);\n"
          "  }\n"
-        "  return RET_OK;"
+        "  return RET_OK;\n"
          "}\n\n";
 }
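Two small but real bugs are fixed above. In CMakeLists.txt, a missing space had fused two flags into "-Werror-ftrapv", which compilers do not recognize, instead of enabling -Werror and -ftrapv separately. In benchmark_component.cc, the emitted string "  return RET_OK;" lacked a trailing \n, so the generated cleanup function ended with "return RET_OK;}" jammed onto one line. With the fix, the generated benchmark cleanup reads as intended; a sketch only, where the function name and the input count of 2 are illustrative placeholders, not values taken from this patch:

  int net_FreeResource() {
    for (int i = 0; i < 2; ++i) {
      free(inputs_binbuf[i]);
    }
    return RET_OK;
  }
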
diff --git a/mindspore/lite/micro/coder/generator/component/train_component.cc b/mindspore/lite/micro/coder/generator/component/train_component.cc
index 66b5512f23..866b284418 100644
--- a/mindspore/lite/micro/coder/generator/component/train_component.cc
+++ b/mindspore/lite/micro/coder/generator/component/train_component.cc
@@ -133,7 +133,7 @@ void CodeTrainImplement(std::ofstream &ofs, const std::string &module_name, cons
     result += "}";
     return result;
   };
-  auto wrap = [](int i) { return "[" + std::to_string(i) + "]"; };
+  auto wrap = [](size_t i) { return "[" + std::to_string(i) + "]"; };
   auto offset_inputs = [&]() {
     std::string src = "origin_inputs";
     std::string dst = "input_ptr";
diff --git a/mindspore/lite/micro/coder/generator/train/train_generator.cc b/mindspore/lite/micro/coder/generator/train/train_generator.cc
index b8aa287098..bfb7d87d0b 100644
--- a/mindspore/lite/micro/coder/generator/train/train_generator.cc
+++ b/mindspore/lite/micro/coder/generator/train/train_generator.cc
@@ -27,7 +27,7 @@ void TrainGenerator::CodeGradientFunc(std::ofstream &ofs) const {
   ofs << "float " << config_->module_name() << "_ComputeLossAndGradient() {\n";
   ofs << "  float loss = 0;\n";
   for (const auto &block : ctx_->train_blocks()) {
-    ofs << "  {\n" << block << "  }\n";
+    ofs << "\t{\n" << block << "\t}\n";
   }
   ofs << "  return loss;\n";
   ofs << "}\n";
@@ -45,9 +45,6 @@ int TrainGenerator::CodeNetHFile() {
   ofs << "#include \"microtensor.h\"\n\n";
   CodeTrainParams(ofs);
   CodeInputAndOutputState(ofs, config_->module_name());
-  if (is_get_quant_args_) {
-    CodeGraphQuantArgsState(ofs, config_->module_name());
-  }
   if (config_->is_weight_file()) {
     CodeInitWeightState(ofs, config_->module_name());
   }
@@ -68,9 +65,6 @@ int TrainGenerator::CodeNetCFile() {
   CodeInitResourceImplement(ofs, config_->module_name(), ctx_);
   CodeFreeResourceImplement(ofs, config_->module_name(), ctx_);
   CodeFeaturesImplement(ofs, config_->module_name(), ctx_);
-  if (is_get_quant_args_) {
-    CodeGraphQuantArgsImplement(ofs, config_->module_name(), ctx_);
-  }
   CodeNetRunFunc(ofs);
   CodeGradientFunc(ofs);
   CodeTrainImplement(ofs, config_->module_name(), ctx_);
@@ -85,22 +79,16 @@ int TrainGenerator::CodeBenchmarkFile() {
   MS_CHECK_TRUE(!ofs.bad(), "filed to open file");
   std::vector<Tensor *> inputs = ctx_->graph_inputs();
   size_t inputs_num = inputs.size();
-
   CodeBenchmarkHeader(ofs, net_inc_hfile_);
   CodeBenchmarkUsage(ofs);
   CodeBenchmarkWarmup(ofs, config_->module_name());
-
   CodeBenchmarkSetInputs(ofs, config_->module_name(), ctx_);
   CodeBenchmarkSetBuffer(ofs, config_->module_name());
   if (config_->is_weight_file()) {
     CodeBenchmarkInitWeight(ofs, config_->module_name());
   }
-  if (config_->code_mode() == CodeMode::Code_Inference) {
-    CodeBenchmarkConfigThread(ofs);
-  }
   CodeBenchmarkInference(ofs, config_->module_name());
   CodeBenchmarkPrintOutputs(ofs, config_->module_name());
-
   CodeBenchmarkFreeResourse(ofs, config_->module_name(), inputs_num);
   ofs.close();
   return RET_OK;
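CodeGradientFunc gives each training block its own brace scope, so locals emitted for different blocks cannot collide; the hunk above only switches that wrapper indentation from spaces to tabs. For a module named "net" with a single train block (both hypothetical stand-ins, not values from this patch), the generated function would look roughly like:

  float net_ComputeLossAndGradient() {
    float loss = 0;
    {
      /* code emitted for the training block; accumulates into loss */
    }
    return loss;
  }

The remaining hunks remove the quant-args state and implementation output, plus the inference-only thread-config call, from the train generator.
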
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/deconvolution_int8_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/int8/deconvolution_int8_coder.cc
new file mode 100644
index 0000000000..ea9242fcc8
--- /dev/null
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/deconvolution_int8_coder.cc
@@ -0,0 +1,161 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "micro/coder/opcoders/nnacl/int8/deconvolution_int8_coder.h"
+#include
+#include "nnacl/int8/deconv_int8.h"
+#include "micro/coder/opcoders/file_collector.h"
+#include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
+
+using mindspore::schema::PrimitiveType_DeConv2D;
+
+namespace mindspore::lite::micro::nnacl {
+
+int DeconvolutionInt8Coder::Init(CoderContext *const context) {
+  CheckSupportOptimize();
+  MS_CHECK_RET_CODE(SetQuantParam(), "deconv int8 SetQuantParam error!");
+  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Conv2DBaseCoder Init error!");
+  MS_CHECK_RET_CODE(InitParam(), "deconv int8 InitParam error!");
+  MS_CHECK_RET_CODE(InitBiasWeight(context), "deconv int8 InitBiasWeight error!");
+  MS_CHECK_RET_CODE(InitData(context), "deconv int8 InitData error!");
+  return RET_OK;
+}
+
+int DeconvolutionInt8Coder::Prepare(CoderContext *const context) {
+  conv_param_->thread_num_ = thread_num_;
+  conv_param_->op_parameter_.thread_num_ = thread_num_;
+  thread_count_ = thread_num_;
+  MS_CHECK_RET_CODE(Init(context), "deconv int8 Init error!");
+  MS_CHECK_RET_CODE(InitRunBuf(context), "deconv int8 InitRunBuf error!");
+  return RET_OK;
+}
+
+void DeconvolutionInt8Coder::CheckSupportOptimize() {
+  support_optimize_ = false;
+  matmul_func_str_ = "NULL";
+}
+
+int DeconvolutionInt8Coder::InitParam() {
+  matmul_param_ = new (std::nothrow) MatMulParameter();
+  MS_CHECK_PTR(matmul_param_);
+  matmul_param_->row_ = conv_param_->input_h_ * conv_param_->input_w_;
+  matmul_param_->deep_ = conv_param_->input_channel_;
+  matmul_param_->col_ = conv_param_->output_channel_ * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+
+  /* optimize normal -> same data layout */
+  int oc4 = UP_DIV(conv_param_->output_channel_, C4NUM);
+  thread_count_ = MSMIN(conv_param_->op_parameter_.thread_num_, oc4);
+  MS_CHECK_TRUE(thread_count_ > 0, "thread_count_ <= 0");
+  thread_stride_ = UP_DIV(oc4, thread_count_);
+  return RET_OK;
+}
+
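+// Layout note: bias is padded up to a multiple of C4NUM output channels, and
+// the transformed weight is padded to C4NUM output channels by C16NUM input
+// channels. The weight buffer is pre-filled with the filter zero-point so
+// that, once the zero-point offset is subtracted inside the int8 matmul, the
+// padded lanes contribute nothing to the accumulated sums.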
+int DeconvolutionInt8Coder::InitBiasWeight(CoderContext *const context) {
+  MS_CHECK_TRUE(conv_param_->output_channel_ > 0, "invalid output_channel");
+  int size = UP_ROUND(conv_param_->output_channel_, C4NUM) * sizeof(int32_t);
+  bias_data_ = reinterpret_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, size, kOfflinePackWeight));
+  MS_CHECK_PTR(bias_data_);
+  MS_CHECK_RET_CODE(memset_s(bias_data_, size, 0, size), "memset_s new_bias_addr_ failed.");
+  if (input_tensors_.size() == kInputSize2) {
+    auto *ori_bias_addr = reinterpret_cast<int32_t *>(bias_tensor_->data_c());
+    MS_CHECK_RET_CODE(memcpy_s(bias_data_, conv_param_->output_channel_ * sizeof(int32_t), ori_bias_addr,
+                               conv_param_->output_channel_ * sizeof(int32_t)),
+                      "memcpy_s new_bias_addr_ failed.");
+  }
+
+  size = UP_ROUND(conv_param_->output_channel_, C4NUM) * UP_ROUND(conv_param_->input_channel_, C16NUM) *
+         conv_param_->kernel_w_ * conv_param_->kernel_h_ * sizeof(int8_t);
+  weight_ptr_ = reinterpret_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, size, kOfflinePackWeight));
+  MS_CHECK_PTR(weight_ptr_);
+  MS_CHECK_RET_CODE(
+    memset_s(weight_ptr_, size, static_cast<int8_t>(conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_), size),
+    "memset_s weight_ptr_ failed.");
+  DeConvWeightTransInt8(reinterpret_cast<int8_t *>(filter_tensor_->data_c()), weight_ptr_, conv_param_->input_channel_,
+                        conv_param_->output_channel_, conv_param_->kernel_h_ * conv_param_->kernel_w_,
+                        support_optimize_);
+
+  size = UP_ROUND(conv_param_->output_channel_, C4NUM) * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  weight_sum_ =
+    reinterpret_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, size * sizeof(int32_t), kOfflinePackWeight));
+  MS_CHECK_PTR(weight_sum_);
+  MS_CHECK_RET_CODE(memset_s(weight_sum_, size * sizeof(int32_t), 0, size * sizeof(int32_t)),
+                    "memset_s weight_sum_ failed.");
+  DeConvPackWeightSum(weight_ptr_, weight_sum_, conv_param_->conv_quant_arg_.input_quant_args_[0].zp_,
+                      conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_, UP_ROUND(matmul_param_->deep_, C16NUM),
+                      size, support_optimize_);
+
+  return RET_OK;
+}
+
+int DeconvolutionInt8Coder::InitData(CoderContext *const context) {
+  input_ptr_size_ = UP_ROUND(conv_param_->input_h_ * conv_param_->input_w_, C4NUM) *
+                    UP_ROUND(conv_param_->input_channel_, C16NUM) * sizeof(int8_t);
+  input_ptr_ = reinterpret_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, input_ptr_size_, kWorkspace));
+  return RET_OK;
+}
+
+int DeconvolutionInt8Coder::InitRunBuf(CoderContext *const context) {
+  tmp_buffer_size_ = UP_ROUND(conv_param_->input_h_ * conv_param_->input_w_, C4NUM) *
+                     UP_ROUND(conv_param_->output_channel_, C4NUM) * conv_param_->kernel_w_ *
+                     conv_param_->kernel_h_ * sizeof(int32_t);
+  tmp_buffer_ = reinterpret_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, tmp_buffer_size_, kWorkspace));
+
+  tmp_output_size_ =
+    UP_ROUND(conv_param_->output_channel_, C4NUM) * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(int32_t);
+  tmp_output_ = reinterpret_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, tmp_output_size_, kWorkspace));
+
+  input_sum_size_ = UP_ROUND(matmul_param_->row_, C4NUM) * sizeof(int32_t);
+  input_sum_ = reinterpret_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, input_sum_size_, kWorkspace));
+  return RET_OK;
+}
+
+int DeconvolutionInt8Coder::DoCode(CoderContext *const context) {
+  Collect(context, {"nnacl/int8/deconv_int8.h"}, {"int8/deconv_int8.c", "pack.c", "quantization/fixed_point.c"});
+
+  nnacl::NNaclInt8Serializer code;
+  code.CodeFunction("memset", input_ptr_, 0, input_ptr_size_);
+  code.CodeFunction("memset", tmp_buffer_, 0, tmp_buffer_size_);
+  code.CodeFunction("memset", tmp_output_, 0, tmp_output_size_);
+  code.CodeFunction("memset", input_sum_, 0, input_sum_size_);
+
+  // define conv params
+  code.CodeStruct("conv_param_", *conv_param_);
+
+  MS_CHECK_TRUE(conv_param_->input_batch_ == 1, "batch number should be 1.");
+
+  code.CodeFunction("RowMajor2Row16x4MajorInt8", input_tensor_, input_ptr_, matmul_param_->row_, matmul_param_->deep_);
+  code.CodeFunction("DeConvPackInputSum", input_ptr_, input_sum_,
+                    conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_, UP_ROUND(matmul_param_->row_, C4NUM),
+                    UP_ROUND(matmul_param_->deep_, C16NUM), support_optimize_);
+
+  int kernel_plane = conv_param_->kernel_w_ * conv_param_->kernel_h_;
+  int cur_oc = MSMIN(thread_stride_, UP_DIV(conv_param_->output_channel_, C8NUM));
+  int cur_oc_res = MSMIN(thread_stride_ * C4NUM, conv_param_->output_channel_);
+
+  MS_CHECK_TRUE(cur_oc > 0, "cur_oc should be greater than 0.");
+
+  code.CodeFunction("DeConvInt8", input_ptr_, weight_ptr_, tmp_buffer_, weight_sum_, input_sum_,
+                    UP_ROUND(matmul_param_->row_, C4NUM), cur_oc * C4NUM * kernel_plane,
+                    UP_ROUND(matmul_param_->deep_, C16NUM), "&conv_param_", matmul_func_str_);
+
+  code.CodeFunction("DeConvPostInt8", tmp_buffer_, bias_data_, tmp_output_, output_tensor_, cur_oc_res, "&conv_param_",
+                    support_optimize_);
+  context->AppendCode(code.str());
+  return RET_OK;
+}
+
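+// Registration note: DoCode above emits the nnacl int8 deconv pipeline into
+// the generated source, in order: pack the input to 16x4 row-major
+// (RowMajor2Row16x4MajorInt8), accumulate per-row input sums
+// (DeConvPackInputSum), run the tiled int8 matmul into an int32 scratch
+// buffer (DeConvInt8), then fold in bias and requantize to the int8 output
+// (DeConvPostInt8).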
+REG_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_DeConv2D, CPUOpCoderCreator<DeconvolutionInt8Coder>)
+}  // namespace mindspore::lite::micro::nnacl
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/deconvolution_int8_coder.h b/mindspore/lite/micro/coder/opcoders/nnacl/int8/deconvolution_int8_coder.h
new file mode 100644
index 0000000000..7b5839f3bf
--- /dev/null
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/deconvolution_int8_coder.h
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_DECONVOLUTION_INT8_CODER_H_
+#define MINDSPORE_LITE_MICRO_CODER_OPCODERS_DECONVOLUTION_INT8_CODER_H_
+
+#include <string>
+#include <vector>
+#include "coder/opcoders/base/conv2d_base_coder.h"
+#include "nnacl/matmul_parameter.h"
+
+namespace mindspore::lite::micro::nnacl {
+class DeconvolutionInt8Coder final : public Conv2DBaseCoder {
+ public:
+  DeconvolutionInt8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
+                         const Model::Node *node, size_t node_index, Target target)
+      : Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {}
+  ~DeconvolutionInt8Coder() override { delete matmul_param_; }
+
+  int DoCode(CoderContext *const context) override;
+  int Prepare(CoderContext *const context) override;
+
+ private:
+  int Init(CoderContext *const context);
+  int InitData(CoderContext *ctx);
+  int InitParam();
+  int InitBiasWeight(CoderContext *ctx);
+  void CheckSupportOptimize();
+  int InitRunBuf(CoderContext *ctx);
+
+  int32_t *tmp_buffer_{nullptr};
+  int tmp_buffer_size_{0};
+  int32_t *tmp_output_{nullptr};
+  int tmp_output_size_{0};
+  int32_t *input_sum_{nullptr};
+  int input_sum_size_{0};
+
+  int8_t *input_ptr_{nullptr};
+  int input_ptr_size_{0};
+  int8_t *weight_ptr_{nullptr};
+  int32_t *weight_sum_{nullptr};
+  size_t thread_count_{1};
+  int thread_stride_{0};
+  int32_t *bias_data_{nullptr};
+  std::string matmul_func_str_;
+  MatMulParameter *matmul_param_{nullptr};
+  bool support_optimize_{true};
+};
+}  // namespace mindspore::lite::micro::nnacl
+#endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_DECONVOLUTION_INT8_CODER_H_
diff --git a/mindspore/lite/micro/coder/session.cc b/mindspore/lite/micro/coder/session.cc
index 777502a83e..8e1c349f3b 100644
--- a/mindspore/lite/micro/coder/session.cc
+++ b/mindspore/lite/micro/coder/session.cc
@@ -132,7 +132,7 @@ int CoderSession::GenerateCode() {
       generator = std::make_shared<InferenceGenerator>(std::move(context_));
       break;
     case Code_Train:
-      MS_LOG(INFO) << "generate code for Inference";
+      MS_LOG(INFO) << "generate code for Train";
       generator = std::make_shared<TrainGenerator>(std::move(context_));
       break;
     default:
@@ -141,6 +141,7 @@
   }
   // when use file, coder context need to remove initial parameters from tensors info
   // we use tmp_tensor_list to storage
+  MS_CHECK_PTR(generator);
   int ret = generator->GenerateCode();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "generate code failed";
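
The MS_CHECK_PTR added in session.cc closes a real gap: when code_mode() matches neither case, the switch leaves generator empty, and the following generator->GenerateCode() would dereference a null shared_ptr. A minimal standalone sketch of that control flow, with a stand-in Generator type instead of the real coder classes (all names here are illustrative, not taken from the patch):

  #include <iostream>
  #include <memory>

  struct Generator {
    int GenerateCode() { return 0; }  // stands in for InferenceGenerator / TrainGenerator
  };

  enum CodeMode { Code_Inference, Code_Train, Code_Unknown };

  int Generate(CodeMode mode) {
    std::shared_ptr<Generator> generator;
    switch (mode) {
      case Code_Inference:
      case Code_Train:
        generator = std::make_shared<Generator>();
        break;
      default:
        std::cerr << "unsupported code mode\n";
        break;  // generator stays null here
    }
    if (generator == nullptr) {  // the guard MS_CHECK_PTR provides in session.cc
      return -1;
    }
    return generator->GenerateCode();
  }

  int main() { return Generate(Code_Unknown) == -1 ? 0 : 1; }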