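Fix bugs in int8 convolution quantization setup and input-sum packing, Winograd output-transform offsets, and FP16 Winograd kernel creation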

5 years ago · 127b089a11
parent 030af09f60
commit 127b089a11
9 changed files with 142 additions and 131 deletions
--- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
@ -296,16 +296,6 @@ int ConvolutionBaseCPUKernel::SetQuantParam() {
    MS_LOG(ERROR) << "Set Output Tensor Quant Param Failed.";
    return ret;
  }
-  ret = SetQuantMultiplier();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Set Quant Multiplier Failed.";
-    return ret;
-  }
-  // now only consider per tensor for output
-  CalculateActivationRangeQuantized(
-    conv_param_->is_relu_, conv_param_->is_relu6_, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_,
-    conv_param_->conv_quant_arg_.output_quant_args_[0].scale_, &conv_param_->conv_quant_arg_.out_act_min_[0],
-    &conv_param_->conv_quant_arg_.out_act_max_[0]);

  ret = SetIfPerChannel();
  if (ret != RET_OK) {
@ -317,6 +307,18 @@ int ConvolutionBaseCPUKernel::SetQuantParam() {
    MS_LOG(ERROR) << "Set if per asymmetric failed.";
    return ret;
  }
+
+  ret = SetQuantMultiplier();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Set Quant Multiplier Failed.";
+    return ret;
+  }
+  // now only consider per tensor for output
+  CalculateActivationRangeQuantized(
+    conv_param_->is_relu_, conv_param_->is_relu6_, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_,
+    conv_param_->conv_quant_arg_.output_quant_args_[0].scale_, &conv_param_->conv_quant_arg_.out_act_min_[0],
+    &conv_param_->conv_quant_arg_.out_act_max_[0]);
+
  return RET_OK;
 }
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
@ -17,6 +17,7 @@
 #include "src/runtime/kernel/arm/fp16/convolution_fp16.h"
 #include <vector>
 #include "src/runtime/kernel/arm/fp16/convolution_sw_fp16.h"
+#include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
 #include "src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h"
 #include "src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h"
 #include "src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h"
@ -243,6 +244,10 @@ kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::tensor::Ten
    InputTransformUnitFunc input_trans_func = nullptr;
    OutputTransformUnitFunc output_trans_func = nullptr;
    CheckIfUseWinograd(&use_winograd, &out_unit, conv_param, input_trans_func, output_trans_func);
+    if (use_winograd) {
+      kernel = new (std::nothrow)
+        kernel::ConvolutionWinogradFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive, out_unit);
+    }
    if (kernel_h != 1 && kernel_w != 1 && !use_winograd) {
      kernel = new (std::nothrow) kernel::ConvolutionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
    }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
@ -51,7 +51,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
  // #endif
  int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane;

-  // init weight
+  // =====================init weight==========================//
  auto origin_weight = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->Data());
  packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
  if (packed_weight_ == nullptr) {
@ -61,7 +61,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
  memset(packed_weight_, 0, pack_weight_size * sizeof(float));
  PackWeightFp32(origin_weight, conv_param_, packed_weight_, oc_block, oc_block_num);

-  // init bias
+  // =======================init bias==========================//
  bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float)));
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "malloc bias failed.";
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.c
@ -633,7 +633,9 @@ void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_d
                                 OutputTransformUnitFp16Func output_trans_func) {
  int output_unit = conv_param->output_unit_;
  int output_w = conv_param->output_w_;
-  int output_unit_block = UP_DIV(output_w, output_unit);
+  int output_h = conv_param->output_h_;
+  int output_w_unit_block = UP_DIV(output_w, output_unit);
+  int output_h_unit_block = UP_DIV(output_h, output_unit);
  int output_channel = conv_param->output_channel_;
  int oc8 = UP_DIV(output_channel, C8NUM);
  int input_unit = conv_param->input_unit_;
@ -644,16 +646,16 @@ void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_d
    int dst_x_s = out_tile_index % output_unit_num;
    int dst_y_s = out_tile_index / output_unit_num;
    int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit;
-    int dst_tile_offset = C8NUM * output_unit * (dst_x_s + dst_y_s * output_unit_block * output_unit);
+    int dst_tile_offset = C8NUM * output_unit * (dst_x_s + dst_y_s * output_w_unit_block * output_unit);

    for (int j = 0; j < oc8; j++) {
      int src_oc8_offset = src_tile_offset + j * input_unit * input_unit * C8NUM;
      int dst_oc8_offset =
-        dst_tile_offset + j * C8NUM * output_unit_block * output_unit_block * output_unit * output_unit;
+        dst_tile_offset + j * C8NUM * output_h_unit_block * output_w_unit_block * output_unit * output_unit;
      const float16_t *src_ptr = gemm_out + src_oc8_offset;
      const float16_t *bias_ptr = bias_data + j * C8NUM;
      float16_t *dst_ptr = tmp_out_data + dst_oc8_offset;
-      output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_unit_block * output_unit);
+      output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit);
    }
    out_tile_index++;
  }
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.c
@ -1066,7 +1066,7 @@ void OutputTransform4x3UnitFp16(const float16_t *src_data, float16_t *dst_data,
    const float16_t t10 = 0.5f * (src_data_10 - src_data_20);
    const float16_t t11 = 0.5f * (src_data_11 - src_data_21);
    const float16_t t12 = 0.5f * (src_data_12 - src_data_22);
-    const const float16_t t13 = 0.5f * (src_data_13 - src_data_23);
+    const float16_t t13 = 0.5f * (src_data_13 - src_data_23);

    const float16_t t20 = 0.25f * (src_data_10 + src_data_20) + src_data_30;
    const float16_t t21 = 0.25f * (src_data_11 + src_data_21) + src_data_31;
@ -2232,7 +2232,7 @@ void OutputTransform8x4UnitFp16(const float16_t *src_data, float16_t *dst_data,
    const float16_t t24 = 0.25f * d35 + d45 + 2.25f * d55;
    const float16_t t25 = 0.25f * d36 + d46 + 2.25f * d56;
    const float16_t t26 = 0.25f * d37 + d47 + 2.25f * d57;
-    const const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58;
+    const float16_t t27 = 0.25f * d38 + d48 + 2.25f * d58;

    const float16_t t30 = 0.125f * d01 + d11 + 3.375f * d21 + src_data_70;
    const float16_t t31 = 0.125f * d02 + d12 + 3.375f * d22 + src_data_71;
@ -3392,7 +3392,7 @@ void OutputTransform8x6UnitFp16(const float16_t *src_data, float16_t *dst_data,
    const float16_t t52 = 0.03125f * d03 + d13 + 7.59375f * d23 + src_data_72;
    const float16_t t53 = 0.03125f * d04 + d14 + 7.59375f * d24 + src_data_73;
    const float16_t t54 = 0.03125f * d05 + d15 + 7.59375f * d25 + src_data_74;
-    const const float16_t t55 = 0.03125f * d06 + d16 + 7.59375f * d26 + src_data_75;
+    const float16_t t55 = 0.03125f * d06 + d16 + 7.59375f * d26 + src_data_75;
    const float16_t t56 = 0.03125f * d07 + d17 + 7.59375f * d27 + src_data_76;
    const float16_t t57 = 0.03125f * d08 + d18 + 7.59375f * d28 + src_data_77;

--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/conv_int8.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/conv_int8.c
@ -325,7 +325,6 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
    for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
      int start_index = thread_id * tile_n;
      int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
-      // todo
      int32_t *tmp_input_sum = input_sum + task_id * tile_n;
      int8_t *gemm_input = packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset;
      // clear tmp buffer before compute
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
@ -295,12 +295,12 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
      }    // kernel_w loop
    }      // kernel_h loop
    if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) {
-      return;
+      continue;
    } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
               (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
      int cal_num_offset = i * conv_param->output_channel_;
      for (int l = 0; l < conv_param->output_channel_; ++l) {
-        input_sum[cal_num_offset + l] = input_accumulator * filter_arg[i].zp_;
+        input_sum[cal_num_offset + l] = input_accumulator * filter_arg[l].zp_;
      }
    } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
               !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
@ -367,12 +367,12 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
      }
    }
    if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) {
-      return;
+      continue;
    } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
               (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
      int cal_num_offset = i * conv_param->output_channel_;
      for (int l = 0; l < conv_param->output_channel_; ++l) {
-        input_sum[cal_num_offset + l] = input_accumulator * filter_arg[i].zp_;
+        input_sum[cal_num_offset + l] = input_accumulator * filter_arg[l].zp_;
      }
    } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
               !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
@ -870,8 +870,8 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int
  int c8 = channel / C8NUM * C8NUM;
  int batch = plane * channel;
  for (int n = 0; n < batches; n++) {
-    const float *src_batch = (const float*) src + n * batch;
-    float *dst_batch = (float*) dst + n * batch;
+    const float *src_batch = (const float *)src + n * batch;
+    float *dst_batch = (float *)dst + n * batch;
    int hw = 0;
    for (; hw < hw8; hw += C8NUM) {
      int c = 0;
@ -947,9 +947,10 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int
          "st1 {v30.4s, v31.4s}, [x11], %[dstStride]\n"

          :
-          : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
+          :
+          [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
          : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
-            "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", 
+            "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
            "v30", "v31");
 #else
        for (int tr = 0; tr < C8NUM; tr++) {
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/winograd_transform.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/winograd_transform.c
@ -81,7 +81,9 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f
                             OutputTransformUnitFunc output_trans_func) {
  int output_unit = conv_param->output_unit_;
  int output_w = conv_param->output_w_;
-  int output_unit_block = UP_DIV(output_w, output_unit);
+  int output_h = conv_param->output_h_;
+  int output_w_unit_block = UP_DIV(output_w, output_unit);
+  int output_h_unit_block = UP_DIV(output_h, output_unit);
  int output_channel = conv_param->output_channel_;
  int oc4 = UP_DIV(output_channel, C4NUM);
  int input_unit = conv_param->input_unit_;
@ -92,16 +94,16 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f
    int dst_x_s = out_tile_index % output_unit_num;
    int dst_y_s = out_tile_index / output_unit_num;
    int src_tile_offset = i * oc4 * C4NUM * input_unit * input_unit;
-    int dst_tile_offset = C4NUM * output_unit * (dst_x_s + dst_y_s * output_unit_block * output_unit);
+    int dst_tile_offset = C4NUM * output_unit * (dst_x_s + dst_y_s * output_w_unit_block * output_unit);

    for (int j = 0; j < oc4; j++) {
      int src_oc4_offset = src_tile_offset + j * input_unit * input_unit * C4NUM;
      int dst_oc4_offset =
-        dst_tile_offset + j * C4NUM * output_unit_block * output_unit_block * output_unit * output_unit;
+        dst_tile_offset + j * C4NUM * output_h_unit_block * output_w_unit_block * output_unit * output_unit;
      const float *src_ptr = gemm_out + src_oc4_offset;
      const float *bias_ptr = bias_data + j * C4NUM;
      float *dst_ptr = tmp_out_data + dst_oc4_offset;
-      output_trans_func(src_ptr, dst_ptr, bias_ptr, C4NUM, output_unit_block * output_unit);
+      output_trans_func(src_ptr, dst_ptr, bias_ptr, C4NUM, output_w_unit_block * output_unit);
    }
    out_tile_index++;
  }
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/winograd_utils.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/winograd_utils.c