!5655 weight quant feature

Merge pull request !5655 from wangchangkai/master
5 years ago · 57215ef000
parent 55650d2ef4 0f2c78253e
commit 57215ef000
18 changed files with 479 additions and 185 deletions
--- a/mindspore/lite/src/model.cc
+++ b/mindspore/lite/src/model.cc
@ -103,8 +103,9 @@ int ModelImpl::BuildOps() {
    auto cNode = meta_graph_->nodes()->GetAs<schema::CNode>(i);
    auto name = cNode->name()->str();
    auto srcPrim = cNode->primitive();
-
-    this->ops_[name] = PrimitiveC::UnPackFromSchemaPrimitive(const_cast<schema::Primitive *>(srcPrim));
+    auto prim = PrimitiveC::UnPackFromSchemaPrimitive(const_cast<schema::Primitive *>(srcPrim));
+    prim->SetQuantType(cNode->quantType());
+    this->ops_[name] = prim;
  }
  return 0;
 }
--- a/mindspore/lite/src/ops/primitive_c.cc
+++ b/mindspore/lite/src/ops/primitive_c.cc
@ -688,6 +688,10 @@ PrimitiveC *PrimitiveC::UnPackFromSchemaPrimitive(const schema::Primitive *primi
  }
  return nullptr;
 }
+void PrimitiveC::SetQuantType(schema::QuantType quant_type) {
+  this->quant_type_ = quant_type;
+}
+schema::QuantType PrimitiveC::GetQuantType() const { return quant_type_;}
 #endif

 int PrimitiveC::Type() const {
--- a/mindspore/lite/src/ops/primitive_c.h
+++ b/mindspore/lite/src/ops/primitive_c.h
@ -145,6 +145,9 @@ class PrimitiveC {

  int Type() const;

+  void SetQuantType(schema::QuantType quant_type);
+  schema::QuantType GetQuantType() const;
+
 protected:
  template <typename T, typename = std::enable_if<std::is_base_of<PrimitiveC, T>::value>>
  static PrimitiveC *NewPrimitiveC(const schema::Primitive *primitive) {
@ -194,6 +197,7 @@ class PrimitiveC {
  const schema::Primitive *primitive_ = nullptr;
  char *primitive_buf_ = nullptr;
  bool infer_flag_ = true;
+  schema::QuantType quant_type_{schema::QuantType_QUANT_NONE};
 };
 #endif
 }  // namespace lite
--- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
@ -331,4 +331,46 @@ int ConvolutionBaseCPUKernel::SetQuantParam() {

  return RET_OK;
 }
+int ConvolutionBaseCPUKernel::RestoreFilter(lite::tensor::Tensor *input_tensor) {
+  MS_ASSERT(input_tensor != nullptr);
+  if (input_tensor->GetQuantParams().empty()) {
+    MS_LOG(ERROR) << "no quant param";
+    return RET_ERROR;
+  }
+  const auto* quant_data = static_cast<const uint8_t*>(input_tensor->Data());
+  auto* dequant_data = static_cast<float *>(malloc(input_tensor->DataSize() * sizeof(float)));
+  if (dequant_data == nullptr) {
+    MS_LOG(ERROR) << "malloc faile";
+    return RET_ERROR;
+  }
+
+  if (input_tensor->GetQuantParams().size() != kPerTensor) {
+    size_t channels = static_cast<size_t>(input_tensor->Batch());
+    if (input_tensor->GetQuantParams().size() != channels) {
+      MS_LOG(ERROR) << "Quant param not equal channel num " << input_tensor->GetQuantParams().size() << channels;
+      return RET_ERROR;
+    }
+    size_t per_channel_size = input_tensor->DataSize() / channels;
+    auto quant_param = input_tensor->GetQuantParams();
+    for (size_t i = 0; i < channels; i++) {
+      auto param = quant_param.at(i);
+      auto scale = param.scale;
+      auto zero_point = param.zeroPoint;
+      for (size_t j = 0; j < per_channel_size; j++) {
+        dequant_data[per_channel_size * i + j] = static_cast<float>(
+          (quant_data[per_channel_size * i + j] - zero_point) * scale);
+      }
+    }
+  } else {
+    auto quant_param = input_tensor->GetQuantParams();
+    auto param = quant_param.front();
+    auto scale = param.scale;
+    auto zero_point = param.zeroPoint;
+    for (int64_t j = 0; j < input_tensor->DataSize(); j++) {
+      dequant_data[j] = static_cast<float>((quant_data[j] - zero_point) * scale);
+    }
+  }
+  input_tensor->SetData(dequant_data);
+  return RET_OK;
+}
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h
@ -60,6 +60,7 @@ class ConvolutionBaseCPUKernel : public LiteKernel {
  int SetQuantMultiplier();
  int CheckResizeValid();
  void FreeQuantParam();
+  static int RestoreFilter(lite::tensor::Tensor *input_tensor);

 protected:
  int tile_num_;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
@ -239,6 +239,12 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten
    CheckIfUseWinograd(&use_winograd, &out_unit, conv_param, input_trans_func, output_trans_func);
  }

+  auto *weight_tensor = inputs.at(kWeightIndex);
+  auto *restore_data = weight_tensor->Data();
+  if (primitive->GetQuantType() == schema::QuantType_WeightQuant) {
+    ConvolutionBaseCPUKernel::RestoreFilter(inputs.at(kWeightIndex));
+  }
+
  kernel::LiteKernel *kernel;
  if (kernel_h == 1 && kernel_w == 1) {
    kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
@ -263,6 +269,12 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(op_parameter->type_));
    return nullptr;
  }
+
+  if (primitive->GetQuantType() == schema::QuantType_WeightQuant) {
+    weight_tensor->FreeData();
+    weight_tensor->SetData(restore_data);
+  }
+
  return kernel;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
@ -131,6 +131,13 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::T
                                               const mindspore::lite::PrimitiveC *primitive) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_DepthwiseConv2D);
+
+  auto *weight_tensor = inputs.at(kWeightIndex);
+  auto *restore_data = weight_tensor->Data();
+  if (primitive->GetQuantType() == schema::QuantType_WeightQuant) {
+    ConvolutionBaseCPUKernel::RestoreFilter(inputs.at(kWeightIndex));
+  }
+
  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
  kernel::LiteKernel *kernel;
  if (conv_param->input_channel_ < 32) {
@ -149,6 +156,12 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::T
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    return nullptr;
  }
+
+  if (primitive->GetQuantType() == schema::QuantType_WeightQuant) {
+    weight_tensor->FreeData();
+    weight_tensor->SetData(restore_data);
+  }
+
  return kernel;
 }

--- a/mindspore/lite/tools/anf_exporter/anf_exporter.cc
+++ b/mindspore/lite/tools/anf_exporter/anf_exporter.cc
@ -64,7 +64,8 @@ int AnfExporter::ConvertQuantParam(const std::unique_ptr<schema::MetaGraphT> &me
  MS_ASSERT(dst_node != nullptr);
  // add quant param
  dst_node->quantType = primitive->GetQuantType();
-  if (dst_node->quantType == schema::QuantType_PostTraining || dst_node->quantType == schema::QuantType_AwareTraining) {
+  if (dst_node->quantType == schema::QuantType_PostTraining || dst_node->quantType == schema::QuantType_AwareTraining
+      || dst_node->quantType == schema::QuantType_WeightQuant) {
    MS_LOG(DEBUG) << "node: " << dst_node->name << " add QuantParam";
    // activation
    auto input_quant_params = primitive->GetInputQuantParams();
@ -103,7 +104,7 @@ int AnfExporter::ConvertQuantParam(const std::unique_ptr<schema::MetaGraphT> &me
      }
    } else {
      for (auto output_quant_param : output_quant_params[0]) {
-        if (tensor_output->quantParams.empty()) {
+        if (tensor_output->quantParams.empty() && dst_node->quantType != schema::QuantType_WeightQuant) {
          std::unique_ptr<schema::QuantParamT> output_quant_param_ptr =
            std::make_unique<schema::QuantParamT>(output_quant_param);
          MS_LOG(DEBUG) << "[output]node: " << dst_node->name << " scale: " << output_quant_param_ptr->scale
--- a/mindspore/lite/tools/converter/anf_transform.cc
+++ b/mindspore/lite/tools/converter/anf_transform.cc
@ -26,6 +26,7 @@
 #include "tools/optimizer/fusion/constant_folding_fusion.h"
 #include "tools/converter/quantizer/post_training_quantizer.h"
 #include "tools/converter/quantizer/quant_cast.h"
+#include "tools/converter/quantizer/weight_quantizer.h"

 using std::string;
 namespace mindspore {
@ -57,11 +58,20 @@ FuncGraphPtr AnfTransform::Transform(const FuncGraphPtr &old_graph, const conver
  FuncGraphPtr new_graph = optimizer->Optimize(old_graph);

  // quant
-  if (config != nullptr && config->quantType == schema::QuantType_PostTraining) {
-    this->mQuantizer = std::make_unique<quant::PostTrainingQuantizer>(new_graph, config->configFile, 8);
-    if (mQuantizer == nullptr) {
-      MS_LOG(ERROR) << "New PostTrainingQuantizer failed";
-      return nullptr;
+  if (config != nullptr) {
+    if (config->quantType == schema::QuantType_PostTraining) {
+      this->mQuantizer = std::make_unique<quant::PostTrainingQuantizer>(new_graph, config->configFile, 8);
+      if (mQuantizer == nullptr) {
+        MS_LOG(ERROR) << "New PostTrainingQuantizer failed";
+        return nullptr;
+      }
+    } else if (config->quantType == schema::QuantType_WeightQuant) {
+      this->mQuantizer = std::make_unique<quant::WeightQuantizer>(new_graph, config->quantSize,
+        config->convWeightQuantChannelThreshold, config->bitNum);
+      if (mQuantizer == nullptr) {
+        MS_LOG(ERROR) << "New PostTrainingQuantizer failed";
+        return nullptr;
+      }
    }
  }
  if (mQuantizer != nullptr) {
@ -71,12 +81,14 @@ FuncGraphPtr AnfTransform::Transform(const FuncGraphPtr &old_graph, const conver
      MS_LOG(ERROR) << "Quant failed " << status;
      return nullptr;
    }
-    quant::QuantCast quant_cast;
-    quant_cast.SetInputDataDType(kNumberTypeFloat32);
-    status = quant_cast.Run(new_graph);
-    if (status != RET_OK) {
-      MS_LOG(ERROR) << "add QuantCast error";
-      return nullptr;
+    if (config->quantType == schema::QuantType_PostTraining) {
+      quant::QuantCast quant_cast;
+      quant_cast.SetInputDataDType(kNumberTypeFloat32);
+      status = quant_cast.Run(new_graph);
+      if (status != RET_OK) {
+        MS_LOG(ERROR) << "add QuantCast error";
+        return nullptr;
+      }
    }
  }

--- a/mindspore/lite/tools/converter/converter_flags.cc
+++ b/mindspore/lite/tools/converter/converter_flags.cc
@ -36,6 +36,8 @@ Flags::Flags() {
  AddFlag(&Flags::stdDev, "stdDev", "Standard deviation value for aware-quantization", "128");
  AddFlag(&Flags::mean, "mean", "Mean value for aware-quantization", "-0.5");
  AddFlag(&Flags::quantSize, "quantSize", "Weight quantization size threshold", "0");
+  AddFlag(&Flags::convWeightQuantChannelThreshold, "convWeightQuantChannelThreshold",
+    "convWeightQuantChannelThreshold", "16");
  AddFlag(&Flags::configFile, "config_file", "Configuration for post-training.", "");
  AddFlag(&Flags::formatTrans, "formatTrans", "whether transform format. true | false", "true");
 }
--- a/mindspore/lite/tools/converter/legacy_optimizer/graph/weight_format_hardcode_pass.cc
+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/weight_format_hardcode_pass.cc
@ -191,6 +191,7 @@ STATUS WeightFormatHardCodePass::HardCodeTFLITE(const std::unique_ptr<CNodeT> &n
  switch (this->quantType) {
    case QuantType_AwareTraining:
    case QuantType_PostTraining:
+    case QuantType_WeightQuant:
    case QuantType_QUANT_NONE: {
      if (opType == schema::PrimitiveType_Conv2D) {
        weightTensor->format = schema::Format_KHWC;
--- a/mindspore/lite/tools/converter/legacy_optimizer/graph/weight_format_transform_pass.cc
+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/weight_format_transform_pass.cc
@ -31,7 +31,7 @@ void WeightFormatTransformPass::SetDstFormat(Format format) { this->dstFormat =

 STATUS WeightFormatTransformPass::Run(MetaGraphT *graph) {
  MS_ASSERT(graph != nullptr);
-  if (this->quantType == QuantType_AwareTraining) {
+  if (this->quantType == QuantType_AwareTraining || this->quantType == QuantType_WeightQuant) {
    auto status = QuantDataFormatTrans(graph);
    if (status != RET_OK) {
      MS_LOG(ERROR) << "QuantDataFormatTrans failed: " << status;
--- a/mindspore/lite/tools/converter/quantizer/CMakeLists.txt
+++ b/mindspore/lite/tools/converter/quantizer/CMakeLists.txt
@ -11,6 +11,7 @@ add_library(quantizer_mid OBJECT
    ${CMAKE_CURRENT_SOURCE_DIR}/general_bitpacking.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/post_training_quantizer.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/quant_cast.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/weight_quantizer.cc
    )

 if(ENABLE_ASAN)
--- a/mindspore/lite/tools/converter/quantizer/post_training_quantizer.cc
+++ b/mindspore/lite/tools/converter/quantizer/post_training_quantizer.cc
@ -530,7 +530,8 @@ STATUS PostTrainingQuantizer::DoWeightQuant(AnfNodePtr weight, std::shared_ptr<P
    return RET_ERROR;
  }
  auto status =
-    QuantFilter(paramValue, primitive_c, QuantType_PostTraining, quant_max, quant_min, bit_num, perchanel, depthwise);
+    QuantFilter<int8_t>(paramValue, primitive_c, QuantType_PostTraining, quant_max,
+      quant_min, bit_num, perchanel, depthwise);
  if (status != RET_OK) {
    MS_LOG(ERROR) << "QuantFilter failed: " << status;
    return status;
--- a/mindspore/lite/tools/converter/quantizer/quantize_util.cc
+++ b/mindspore/lite/tools/converter/quantizer/quantize_util.cc
@ -279,171 +279,6 @@ STATUS CalQuantizationParams(schema::QuantParamT *quantParam, double mMin, doubl
  return RET_OK;
 }

-STATUS QuantFilter(ParamValueLitePtr weight, std::shared_ptr<PrimitiveC> primitive_c, QuantType quantType,
-                   int quant_max, int quant_min, size_t bitNum, bool per_channel, bool depth_wise) {
-  auto dims = weight->tensor_shape();
-  if (per_channel) {
-    if (dims.size() != 4) {
-      MS_LOG(ERROR) << "weight dims size error: " << dims.size() << " Back to per layer.";
-      per_channel = false;
-    } else {
-      uint32_t channels = dims[0];
-      if (channels == 0) {
-        MS_LOG(ERROR) << "channels is 0";
-        return RET_ERROR;
-      }
-    }
-  }
-
-  vector<schema::QuantParamT> quant_params;
-  size_t elem_count = weight->tensor_shape_size();
-  auto *raw_datas = static_cast<float *>(weight->tensor_addr());
-  if (raw_datas == nullptr) {
-    MS_LOG(ERROR) << "rawDatas is nullptr";
-    return RET_ERROR;
-  }
-  vector<int8_t> quant_datas(elem_count);
-
-  if (per_channel) {
-    // notice:
-    // at now for tflite model, Conv2D's weight format is KHWC, so is DepthwiseConv2D
-    // if TransWeightFormat is done before PostTraingingQuantization, the DepthwiseCon2D's weight is CHWK
-    if (depth_wise) {
-      // channel at last
-      auto channels = dims[3];
-      if (channels == 0) {
-        MS_LOG(ERROR) << "channels is zero";
-        return RET_ERROR;
-      }
-      size_t one_filter_size = elem_count / channels;
-
-      for (int i = 0; i < channels; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-        // find min and max
-        for (size_t j = 0; j < one_filter_size; j++) {
-          auto index = i + j * channels;
-          if (index >= elem_count) {
-            MS_LOG(ERROR) << "over flow!";
-            return RET_ERROR;
-          }
-          min = std::min(min, raw_datas[index]);
-          max = std::max(max, raw_datas[index]);
-        }
-        schema::QuantParamT quant_param;
-        STATUS status = CalQuantizationParams(&quant_param, min, max, false, quant_max, quant_min, bitNum);
-        if (status != RET_OK) {
-          MS_LOG(ERROR) << "CalQuantizationParams failed" << status;
-          return status;
-        }
-        quant_params.emplace_back(quant_param);
-        // do quantization
-        for (uint32_t j = 0; j < one_filter_size; j++) {
-          auto index = i + j * channels;
-          if (index >= elem_count) {
-            MS_LOG(ERROR) << "over flow!";
-            return RET_ERROR;
-          }
-          float raw_data = raw_datas[index];
-          auto quant_data = QuantizeData<int8_t>(raw_data, quant_param, quant_max, quant_min);
-          quant_datas[index] = quant_data;
-        }
-      }
-      auto ret = memcpy_s(raw_datas, weight->tensor_size(), quant_datas.data(),
-                          elem_count * sizeof(int8_t));
-      if (ret != EOK) {
-        MS_LOG(ERROR) << "memcpy error: " << ret;
-        return RET_ERROR;
-      }
-      weight->set_tensor_size(elem_count * sizeof(int8_t));
-    } else {
-      // channel at first
-      auto channels = dims[0];
-      if (channels == 0) {
-        MS_LOG(ERROR) << "channels is zero";
-        return RET_ERROR;
-      }
-      size_t one_filter_size = elem_count / channels;
-
-      for (int i = 0; i < channels; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-        // find min and max
-        for (size_t j = 0; j < one_filter_size; j++) {
-          auto index = j + i * one_filter_size;
-          if (index >= elem_count) {
-            MS_LOG(ERROR) << "over flow!";
-            return RET_ERROR;
-          }
-          min = std::min(min, raw_datas[index]);
-          max = std::max(max, raw_datas[index]);
-        }
-        schema::QuantParamT quant_param;
-        STATUS status = CalQuantizationParams(&quant_param, min, max, false, quant_max, quant_min, bitNum);
-        if (status != RET_OK) {
-          MS_LOG(ERROR) << "CalQuantizationParams failed" << status;
-          return status;
-        }
-        quant_params.emplace_back(quant_param);
-        // do quantization
-        for (uint32_t j = 0; j < one_filter_size; j++) {
-          auto index = j + i * one_filter_size;
-          if (index >= elem_count) {
-            MS_LOG(ERROR) << "over flow!";
-            return RET_ERROR;
-          }
-          float raw_data = raw_datas[index];
-          auto quant_data = QuantizeData<int8_t>(raw_data, quant_param, quant_max, quant_min);
-          quant_datas[index] = quant_data;
-        }
-      }
-      auto ret =
-          memcpy_s(raw_datas, weight->tensor_size(), quant_datas.data(), elem_count * sizeof(int8_t));
-      if (ret != EOK) {
-        MS_LOG(ERROR) << "memcpy error: " << ret;
-        return RET_ERROR;
-      }
-      weight->set_tensor_size(elem_count * sizeof(int8_t));
-    }
-
-  } else {
-    // per layer
-    float min = FLT_MAX;
-    float max = -FLT_MIN;
-    for (uint32_t i = 0; i < elem_count; i++) {
-      // find max min
-      min = std::min(min, raw_datas[i]);
-      max = std::max(max, raw_datas[i]);
-    }
-
-    schema::QuantParamT quant_param;
-    STATUS status = CalQuantizationParams(&quant_param, min, max, false, quant_max, quant_min, bitNum);
-    if (status != RET_OK) {
-      MS_LOG(ERROR) << "CalQuantizationParams failed" << status;
-      return status;
-    }
-    quant_params.emplace_back(quant_param);
-    // update data and datatype
-    for (uint32_t i = 0; i < elem_count; i++) {
-      float raw_data = raw_datas[i];
-      auto quant_data = QuantizeData<int8_t>(raw_data, quant_param, quant_max, quant_min);
-      quant_datas[i] = quant_data;
-    }
-    auto ret = memcpy_s(raw_datas, weight->tensor_size(), quant_datas.data(), elem_count * sizeof(int8_t));
-    if (ret != EOK) {
-      MS_LOG(ERROR) << "memcpy error: " << ret;
-      return RET_ERROR;
-    }
-    weight->set_tensor_size(elem_count * sizeof(int8_t));
-  }
-  if (quant_params.empty()) {
-    MS_LOG(ERROR) << "quant_params empty";
-    return RET_ERROR;
-  }
-  primitive_c->AddInputQuantParam(quant_params);
-  return RET_OK;
-}
-
 STATUS PostBitPack(float *weight, size_t shapeSize, size_t bitNum) {
  auto *rawDatas = reinterpret_cast<uint8_t *>(weight);
  vector<uint8_t> qDatas(rawDatas, rawDatas + shapeSize);
--- a/mindspore/lite/tools/converter/quantizer/quantize_util.h
+++ b/mindspore/lite/tools/converter/quantizer/quantize_util.h
@ -21,6 +21,8 @@
 #include <string>
 #include <cmath>
 #include <array>
+#include <vector>
+#include <algorithm>
 #include "tools/converter/quantizer/quantizer.h"
 #include "src/ops/primitive_c.h"
 #include "include/errorcode.h"
@ -117,10 +119,171 @@ T QuantizeData(float originData, const schema::QuantParamT &quantParam, int quan
    return static_cast<T>(quant_data);
  }();
 }
-
+template <typename T>
 STATUS QuantFilter(ParamValueLitePtr weight, std::shared_ptr<PrimitiveC> primitive_c, QuantType quantType,
-                   int quant_max, int quant_min, size_t bitNum = UINT8_QUANTIZATION, bool per_channel = false,
-                   bool depth_wise = false);
+                   int quant_max, int quant_min, size_t bitNum, bool per_channel, bool depth_wise) {
+  auto dims = weight->tensor_shape();
+  if (per_channel) {
+    if (dims.size() != 4) {
+      MS_LOG(ERROR) << "weight dims size error: " << dims.size() << " Back to per layer.";
+      per_channel = false;
+    } else {
+      uint32_t channels = dims[0];
+      if (channels == 0) {
+        MS_LOG(ERROR) << "channels is 0";
+        return RET_ERROR;
+      }
+    }
+  }
+
+  std::vector<schema::QuantParamT> quant_params;
+  size_t elem_count = weight->tensor_shape_size();
+  auto *raw_datas = static_cast<float *>(weight->tensor_addr());
+  if (raw_datas == nullptr) {
+    MS_LOG(ERROR) << "rawDatas is nullptr";
+    return RET_ERROR;
+  }
+  std::vector<T> quant_datas(elem_count);
+
+  if (per_channel) {
+    // notice:
+    // at now for tflite model, Conv2D's weight format is KHWC, so is DepthwiseConv2D
+    // if TransWeightFormat is done before PostTraingingQuantization, the DepthwiseCon2D's weight is CHWK
+    if (depth_wise) {
+      // channel at last
+      auto channels = dims[3];
+      if (channels == 0) {
+        MS_LOG(ERROR) << "channels is zero";
+        return RET_ERROR;
+      }
+      size_t one_filter_size = elem_count / channels;
+
+      for (int i = 0; i < channels; i++) {
+        float min = FLT_MAX;
+        float max = -FLT_MAX;
+        // find min and max
+        for (size_t j = 0; j < one_filter_size; j++) {
+          auto index = i + j * channels;
+          if (index >= elem_count) {
+            MS_LOG(ERROR) << "over flow!";
+            return RET_ERROR;
+          }
+          min = std::min(min, raw_datas[index]);
+          max = std::max(max, raw_datas[index]);
+        }
+        schema::QuantParamT quant_param;
+        STATUS status = CalQuantizationParams(&quant_param, min, max, false, quant_max, quant_min, bitNum);
+        if (status != RET_OK) {
+          MS_LOG(ERROR) << "CalQuantizationParams failed" << status;
+          return status;
+        }
+        quant_params.emplace_back(quant_param);
+        // do quantization
+        for (uint32_t j = 0; j < one_filter_size; j++) {
+          auto index = i + j * channels;
+          if (index >= elem_count) {
+            MS_LOG(ERROR) << "over flow!";
+            return RET_ERROR;
+          }
+          float raw_data = raw_datas[index];
+          auto quant_data = QuantizeData<T>(raw_data, quant_param, quant_max, quant_min);
+          quant_datas[index] = quant_data;
+        }
+      }
+      auto ret = memcpy_s(raw_datas, weight->tensor_size(), quant_datas.data(),
+                          elem_count * sizeof(T));
+      if (ret != EOK) {
+        MS_LOG(ERROR) << "memcpy error: " << ret;
+        return RET_ERROR;
+      }
+      weight->set_tensor_size(elem_count * sizeof(T));
+    } else {
+      // channel at first
+      auto channels = dims[0];
+      if (channels == 0) {
+        MS_LOG(ERROR) << "channels is zero";
+        return RET_ERROR;
+      }
+      size_t one_filter_size = elem_count / channels;
+
+      for (int i = 0; i < channels; i++) {
+        float min = FLT_MAX;
+        float max = -FLT_MAX;
+        // find min and max
+        for (size_t j = 0; j < one_filter_size; j++) {
+          auto index = j + i * one_filter_size;
+          if (index >= elem_count) {
+            MS_LOG(ERROR) << "over flow!";
+            return RET_ERROR;
+          }
+          min = std::min(min, raw_datas[index]);
+          max = std::max(max, raw_datas[index]);
+        }
+        schema::QuantParamT quant_param;
+        STATUS status = CalQuantizationParams(&quant_param, min, max, false, quant_max, quant_min, bitNum);
+        if (status != RET_OK) {
+          MS_LOG(ERROR) << "CalQuantizationParams failed" << status;
+          return status;
+        }
+        quant_params.emplace_back(quant_param);
+        // do quantization
+        for (uint32_t j = 0; j < one_filter_size; j++) {
+          auto index = j + i * one_filter_size;
+          if (index >= elem_count) {
+            MS_LOG(ERROR) << "over flow!";
+            return RET_ERROR;
+          }
+          float raw_data = raw_datas[index];
+          auto quant_data = QuantizeData<T>(raw_data, quant_param, quant_max, quant_min);
+          quant_datas[index] = quant_data;
+        }
+      }
+      auto ret =
+          memcpy_s(raw_datas, weight->tensor_size(), quant_datas.data(), elem_count * sizeof(int8_t));
+      if (ret != EOK) {
+        MS_LOG(ERROR) << "memcpy error: " << ret;
+        return RET_ERROR;
+      }
+      weight->set_tensor_size(elem_count * sizeof(T));
+    }
+
+  } else {
+    // per layer
+    float min = FLT_MAX;
+    float max = -FLT_MIN;
+    for (uint32_t i = 0; i < elem_count; i++) {
+      // find max min
+      min = std::min(min, raw_datas[i]);
+      max = std::max(max, raw_datas[i]);
+    }
+
+    schema::QuantParamT quant_param;
+    STATUS status = CalQuantizationParams(&quant_param, min, max, false, quant_max, quant_min, bitNum);
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << "CalQuantizationParams failed" << status;
+      return status;
+    }
+    quant_params.emplace_back(quant_param);
+    // update data and datatype
+    for (uint32_t i = 0; i < elem_count; i++) {
+      float raw_data = raw_datas[i];
+      auto quant_data = QuantizeData<T>(raw_data, quant_param, quant_max, quant_min);
+      quant_datas[i] = quant_data;
+    }
+    auto ret = memcpy_s(raw_datas, weight->tensor_size(), quant_datas.data(), elem_count * sizeof(int8_t));
+    if (ret != EOK) {
+      MS_LOG(ERROR) << "memcpy error: " << ret;
+      return RET_ERROR;
+    }
+    weight->set_tensor_size(elem_count * sizeof(T));
+  }
+  if (quant_params.empty()) {
+    MS_LOG(ERROR) << "quant_params empty";
+    return RET_ERROR;
+  }
+  primitive_c->AddInputQuantParam(quant_params);
+  return RET_OK;
+}

 STATUS PostBitPack(float *weights, size_t shapeSize, size_t bitNum = UINT8_QUANTIZATION);
 }  // namespace quant
--- a/mindspore/lite/tools/converter/quantizer/weight_quantizer.cc
+++ b/mindspore/lite/tools/converter/quantizer/weight_quantizer.cc
@ -0,0 +1,148 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tools/converter/quantizer/weight_quantizer.h"
+#include <list>
+#include <string>
+#include <vector>
+#include "src/common/common.h"
+#include "ir/dtype/type_id.h"
+
+using std::string;
+using std::vector;
+
+namespace mindspore {
+namespace lite {
+namespace quant {
+
+WeightQuantizer::WeightQuantizer(FuncGraphPtr graph, const string &weightSize,
+                                 const std::string &convWeightChannelThreshold, const std::string &bitNum)
+    : Quantizer(graph) {
+  auto quantSize = static_cast<size_t>(std::stoull(weightSize));
+  this->bitNum = static_cast<size_t>(std::stoull(bitNum));
+  auto convQuantWeightChannelThreshold = static_cast<size_t>(std::stoull(convWeightChannelThreshold));
+  mStrategy.reset(new QuantStrategy(quantSize, convQuantWeightChannelThreshold));
+}
+
+STATUS WeightQuantizer::DoConvQuantize(const std::list<CNodePtr> &nodes) {
+  for (auto &cnode : nodes) {
+    if (!mStrategy->CanConvOpQuantized(cnode)) {
+      continue;
+    }
+
+    auto primitive_c = GetValueNode<std::shared_ptr<PrimitiveC>>(cnode->input(0));
+    if (primitive_c == nullptr) {
+      MS_LOG(ERROR) << "primitive_c is nullptr";
+      return RET_ERROR;
+    }
+
+    auto inputNode = cnode->input(2);
+    if (!inputNode->isa<Parameter>()) {
+      return RET_ERROR;
+    }
+
+    auto paramNode = inputNode->cast<ParameterPtr>();
+    if (!paramNode->has_default()) {
+      return RET_ERROR;
+    }
+
+    std::vector<schema::QuantParamT> quant_params;
+    primitive_c->AddInputQuantParam(quant_params);
+
+    auto op_type = (schema::PrimitiveType)primitive_c->Type();
+    bool depthwise = op_type == schema::PrimitiveType_DepthwiseConv2D ? true : false;
+
+    ParamValueLitePtr param_value = std::static_pointer_cast<ParamValueLite>(paramNode->default_param());
+    auto status = QuantFilter<uint8_t>(param_value, primitive_c, QuantType_WeightQuant, 255, 0,
+      bitNum, true, depthwise);
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << "QuantFilter failed : " << status;
+      return status;
+    }
+    param_value->set_tensor_type(kNumberTypeUInt8);
+    primitive_c->SetQuantType(schema::QuantType_WeightQuant);
+  }
+
+  return RET_OK;
+}
+
+STATUS WeightQuantizer::DoMulQuantize(const std::list<CNodePtr> &nodes) {
+  for (auto &node : nodes) {
+    if (!mStrategy->CanMulOpQuantized(node)) {
+      continue;
+    }
+
+    ParamValueLitePtr param_value = nullptr;
+    for (size_t i = 1; i < node->size(); i++) {
+      auto inputNode = node->input(i);
+      if (inputNode->isa<Parameter>() == true) {
+        auto paramNode = inputNode->cast<ParameterPtr>();
+        if ((paramNode != nullptr) && (paramNode->has_default() == true)) {
+          param_value = std::static_pointer_cast<ParamValueLite>(paramNode->default_param());
+          if ((param_value == nullptr) || (param_value->tensor_size() == 0)
+              || (param_value->tensor_shape().size() != 4)
+              || (param_value->tensor_addr() == nullptr)
+              || (param_value->tensor_type() != mindspore::kNumberTypeFloat32)) {
+            param_value = nullptr;
+            continue;
+          } else {
+            break;
+          }
+        }
+      }
+    }
+    if (param_value == nullptr) {
+      MS_LOG(ERROR) << "No valid input param node !";
+      return RET_ERROR;;
+    }
+
+    auto primitive_c = GetValueNode<std::shared_ptr<PrimitiveC>>(node->input(0));
+    if (primitive_c == nullptr) {
+      MS_LOG(ERROR) << "primitive_c is nullptr";
+      return RET_ERROR;
+    }
+
+    auto status = QuantFilter<uint8_t>(param_value, primitive_c, QuantType_WeightQuant, 255, 0, bitNum, true, false);
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << "QuantFilter failed : " << status;
+      return status;
+    }
+    param_value->set_tensor_type(kNumberTypeUInt8);
+    primitive_c->SetQuantType(schema::QuantType_WeightQuant);
+  }
+
+  return RET_OK;
+}
+
+STATUS WeightQuantizer::DoQuantize(FuncGraphPtr funcGraph) {
+  auto ret = RET_OK;
+  auto cnodes = funcGraph->GetOrderedCnodes();
+  ret = DoConvQuantize(cnodes);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "DoConvQuantize failed :" << ret;
+    return ret;
+  }
+  ret = DoMulQuantize(cnodes);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "DoMulQuantize failed :" << ret;
+    return ret;
+  }
+  return ret;
+}
+}  // namespace quant
+}  // namespace lite
+}  // namespace mindspore
+
--- a/mindspore/lite/tools/converter/quantizer/weight_quantizer.h
+++ b/mindspore/lite/tools/converter/quantizer/weight_quantizer.h
@ -0,0 +1,53 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef WEIGHT_QUANTIZER_H
+#define WEIGHT_QUANTIZER_H
+
+#include <memory>
+#include <list>
+#include <string>
+#include "tools/converter/quantizer/quantizer.h"
+#include "tools/converter/quantizer/quantize_util.h"
+#include "ir/func_graph.h"
+#include "ir/anf.h"
+#include "include/model.h"
+#include "base/base.h"
+#include "abstract/dshape.h"
+
+namespace mindspore {
+namespace lite {
+namespace quant {
+class WeightQuantizer : public Quantizer {
+ public:
+  WeightQuantizer(FuncGraphPtr graph, const std::string& weightSize,
+                  const std::string& covWeightChannelThreshold, const std::string& bitNum);
+
+  ~WeightQuantizer() = default;
+
+  STATUS DoQuantize(FuncGraphPtr funcGraph) override;
+  STATUS DoConvQuantize(const std::list<CNodePtr> &nodes);
+  STATUS DoMulQuantize(const std::list<CNodePtr> &nodes);
+
+ private:
+  std::unique_ptr<QuantStrategy> mStrategy;
+  size_t bitNum;
+};
+}  // namespace quant
+}  // namespace lite
+}  // namespace mindspore
+#endif
+