The offline (post-training) quantization method follows NVIDIA's scheme and uses symmetric quantization. For activations, KL divergence is used to pick the optimal scale; for weights, per-channel max_abs quantization is used, so each convolution kernel (output channel) gets its own scale.
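As a rough illustration (not MNN's actual kernels), symmetric int8 quantization maps a float value onto [-127, 127] through a single scale: max_abs / 127 per output channel for weights, and a KL-derived scale for activations as described below.

```cpp
// Minimal sketch of symmetric int8 quantization, assuming the scale has
// already been chosen (max_abs / 127 for weights, KL threshold for activations).
#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t SymmetricQuantize(float x, float scale) {
    int q = static_cast<int>(std::round(x / scale));               // map float onto the integer grid
    return static_cast<int8_t>(std::max(-127, std::min(127, q)));  // clamp to the symmetric int8 range
}
```

The entry point of the tool is the Calibration constructor, which parses the config file and sets everything up: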

```cpp
Calibration::Calibration(MNN::NetT* model, uint8_t* modelBuffer, const int bufferSize, const std::string& configPath)
    : _originaleModel(model) {
    // when the format of input image is RGB/BGR, channels equal to 3, GRAY is 1
    int channles = 3;
    rapidjson::Document document;
    {
        std::ifstream fileNames(configPath.c_str());
        std::ostringstream output;
        output << fileNames.rdbuf();
        auto outputStr = output.str();
        document.Parse(outputStr.c_str());
        if (document.HasParseError()) {
            MNN_ERROR("Invalid json\n");
            return;
        }
    }
    auto picObj = document.GetObject();
    ImageProcess::Config config;
    config.filterType = BILINEAR;
    config.destFormat = BGR;
    {
        if (picObj.HasMember("format")) {
            auto format = picObj["format"].GetString();
            static std::map<std::string, ImageFormat> formatMap{{"BGR", BGR}, {"RGB", RGB}, {"GRAY", GRAY}};
            if (formatMap.find(format) != formatMap.end()) {
                config.destFormat = formatMap.find(format)->second;
            }
        }
    }
    if (config.destFormat == GRAY) {
        channles = 1;
    }
    config.sourceFormat = RGBA;
    std::string imagePath;
    _imageNum = 0;
    {
        if (picObj.HasMember("mean")) {
            auto mean = picObj["mean"].GetArray();
            int cur   = 0;
            for (auto iter = mean.begin(); iter != mean.end(); iter++) {
                config.mean[cur++] = iter->GetFloat();
            }
        }
        if (picObj.HasMember("normal")) {
            auto normal = picObj["normal"].GetArray();
            int cur     = 0;
            for (auto iter = normal.begin(); iter != normal.end(); iter++) {
                config.normal[cur++] = iter->GetFloat();
            }
        }
        if (picObj.HasMember("width")) {
            _width = picObj["width"].GetInt();
        }
        if (picObj.HasMember("height")) {
            _height = picObj["height"].GetInt();
        }
        if (picObj.HasMember("path")) {
            imagePath = picObj["path"].GetString();
        }
        if (picObj.HasMember("used_image_num")) {
            _imageNum = picObj["used_image_num"].GetInt();
        }
        if (picObj.HasMember("feature_quantize_method")) {
            std::string method = picObj["feature_quantize_method"].GetString();
            if (Helper::featureQuantizeMethod.find(method) != Helper::featureQuantizeMethod.end()) {
                _featureQuantizeMethod = method;
            } else {
                MNN_ERROR("not supported feature quantization method: %s\n", method.c_str());
                return;
            }
        }
        if (picObj.HasMember("weight_quantize_method")) {
            std::string method = picObj["weight_quantize_method"].GetString();
            if (Helper::weightQuantizeMethod.find(method) != Helper::weightQuantizeMethod.end()) {
                _weightQuantizeMethod = method;
            } else {
                MNN_ERROR("not supported weight quantization method: %s\n", method.c_str());
                return;
            }
        }
        DLOG(INFO) << "Use feature quantization method: " << _featureQuantizeMethod;
        DLOG(INFO) << "Use weight quantization method: " << _weightQuantizeMethod;
    }
    std::shared_ptr<ImageProcess> process(ImageProcess::create(config));
    _process = process;
    // read images file names
    Helper::readImages(_imgaes, imagePath.c_str(), &_imageNum);
    _initMNNSession(modelBuffer, bufferSize, channles);
    _initMaps();
}
```

The flow of the Calibration constructor is as follows:

• Read the quantization config file and set the corresponding quantization parameters (an example config is sketched below)
• Set up image preprocessing and read the list of calibration images
• Initialize the inference session used to collect tensor statistics
• Create a TensorStatistic for each tensor whose statistics will be collected
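A config file consumed by this constructor might look like the sketch below. The field names come from the parsing code above; the values, the image path, and the weight-method string are illustrative only.

```json
{
    "format": "RGB",
    "mean": [127.5, 127.5, 127.5],
    "normal": [0.00784314, 0.00784314, 0.00784314],
    "width": 224,
    "height": 224,
    "path": "path/to/calibration/images/",
    "used_image_num": 100,
    "feature_quantize_method": "KL",
    "weight_quantize_method": "MAX_ABS"
}
```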

Initialize the session used to collect tensor statistics:

```cpp
void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels) {
    _interpreter.reset(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize));
    MNN::ScheduleConfig config;
    _session     = _interpreter->createSession(config);
    _inputTensor = _interpreter->getSessionInput(_session, NULL);
    _inputTensorDims.resize(4);
    auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat;
    DCHECK(4 == _inputTensor->dimensions()) << "Only support 4 dimensions input";
    if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) {
        _inputTensorDims[0] = 1;
        _inputTensorDims[1] = _height;
        _inputTensorDims[2] = _width;
        _inputTensorDims[3] = channels;
    } else if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NC4HW4) {
        _inputTensorDims[0] = 1;
        _inputTensorDims[1] = channels;
        _inputTensorDims[2] = _height;
        _inputTensorDims[3] = _width;
    } else {
        DLOG(ERROR) << "Input Data Format ERROR!";
    }
    if (_featureQuantizeMethod == "KL") {
        _interpreter->resizeTensor(_inputTensor, _inputTensorDims);
        _interpreter->resizeSession(_session);
    } else if (_featureQuantizeMethod == "ADMM") {
        DCHECK((_imageNum * 4 * _height * _width) < (INT_MAX / 4)) << "Use Little Number of Images When Use ADMM";
        _inputTensorDims[0] = _imageNum;
        _interpreter->resizeTensor(_inputTensor, _inputTensorDims);
        _interpreter->resizeSession(_session);
    }
    _interpreter->releaseModel();
}
```
```cpp
void Calibration::_initMaps() {
    _featureInfo.clear();
    _opInfo.clear(); // records the input and output tensor pointers of each op
    _tensorMap.clear();
    // std::set<std::string> Helper::gNeedFeatureOp = {"Convolution", "ConvolutionDepthwise", "Eltwise", "Pooling"};
    // the quantized ops are Convolution, ConvolutionDepthwise, Eltwise(Add) and Pooling
    // run mnn once, initialize featureMap, opInfo map
    // "before" is called before each op is executed
    MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        _opInfo[info->name()].first = nTensors;
        // if this op is a quantized op, add all of its input tensors to the map of tensors whose
        // quantization statistics will be collected, each wrapped in a TensorStatistic
        if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
            for (auto t : nTensors) {
                if (_featureInfo.find(t) == _featureInfo.end()) {
                    _featureInfo[t] = std::shared_ptr<TensorStatistic>(
                        new TensorStatistic(t, _featureQuantizeMethod, info->name() + "__input"));
                }
            }
        }
        return false;
    };
    // "after" is called after each op is executed
    MNN::TensorCallBackWithInfo after = [this](const std::vector<MNN::Tensor*>& nTensors,
                                               const MNN::OperatorInfo* info) {
        _opInfo[info->name()].second = nTensors;
        // if this op is a quantized op, add all of its output tensors to the same map,
        // each wrapped in a TensorStatistic
        if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
            for (auto t : nTensors) {
                if (_featureInfo.find(t) == _featureInfo.end()) {
                    _featureInfo[t] =
                        std::shared_ptr<TensorStatistic>(new TensorStatistic(t, _featureQuantizeMethod, info->name()));
                }
            }
        }
        return true;
    };
    // run inference with the two callbacks installed to record the tensors to be profiled
    _interpreter->runSessionWithCallBackInfo(_session, before, after);
    // record the mapping from tensor index to tensor pointer
    for (auto& op : _originaleModel->oplists) {
        if (_opInfo.find(op->name) == _opInfo.end()) {
            continue;
        }
        for (int i = 0; i < op->inputIndexes.size(); ++i) {
            _tensorMap[op->inputIndexes[i]] = _opInfo[op->name].first[i];
        }
        for (int i = 0; i < op->outputIndexes.size(); ++i) {
            _tensorMap[op->outputIndexes[i]] = _opInfo[op->name].second[i];
        }
    }
    if (_featureQuantizeMethod == "KL") {
        // the KL statistic is not suitable for the input tensor,
        // so set the tensor-statistic method of the input tensor to THRESHOLD_MAX
        auto inputTensorStatistic = _featureInfo.find(_inputTensor);
        if (inputTensorStatistic != _featureInfo.end()) {
            inputTensorStatistic->second->setThresholdMethod(THRESHOLD_MAX);
        }
    }
}
```
```cpp
void Calibration::runQuantizeModel() {
    // compute the scale values of the activations
    if (_featureQuantizeMethod == "KL") {
        _computeFeatureScaleKL();
    } else if (_featureQuantizeMethod == "ADMM") {
        _computeFeatureScaleADMM();
    }
    // collect weight statistics, create the quantized ops, and rebuild the network
    _updateScale();
    // for ops that do not support quantization, insert dequantization ops to convert back to float;
    // dequantization ops are also inserted for the final outputs
    _insertDequantize();
}
```
```cpp
void Calibration::_computeFeatureScaleKL() {
    // compute the range of each feature map
    _computeFeatureMapsRange();
    // collect the distribution of each feature map
    _collectFeatureMapsDistribution();
    _scales.clear();
    for (auto& iter : _featureInfo) {
        AUTOTIME;
        // compute the scale value of this tensor
        _scales[iter.first] = iter.second->finishAndCompute();
    }
    //_featureInfo.clear();//No need now
}
```

```cpp
void resetUpdatedRangeFlags() {
    mUpdatedRangeFlags = false;
}

void TensorStatistic::updateRange() {
    if (mUpdatedRangeFlags) {
        return;
    }
    mUpdatedRangeFlags = true;
    mOriginTensor->copyToHostTensor(mHostTensor.get());
    int batch   = mHostTensor->batch();
    int channel = mHostTensor->channel();
    int width   = mHostTensor->width();
    int height  = mHostTensor->height();
    auto area   = width * height;

    for (int n = 0; n < batch; ++n) {
        auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0);
        for (int c = 0; c < channel; ++c) {
            int cIndex = c;
            // mMergeChannel defaults to true, i.e. activations are NOT quantized per channel
            if (mMergeChannel) {
                cIndex = 0;
            }
            // track the minimum and maximum values
            auto minValue    = mRangePerChannel[cIndex].first;
            auto maxValue    = mRangePerChannel[cIndex].second;
            auto dataChannel = dataBatch + c * mHostTensor->stride(1);
            for (int v = 0; v < area; ++v) {
                minValue = std::min(minValue, dataChannel[v]);
                maxValue = std::max(maxValue, dataChannel[v]);
            }
            mRangePerChannel[cIndex].first  = minValue;
            mRangePerChannel[cIndex].second = maxValue;
        }
    }
}

void Calibration::_computeFeatureMapsRange() {
    // feed input data according to input images
    int count = 0;
    for (const auto& img : _imgaes) {
        for (auto& iter : _featureInfo) {
            // a tensor may appear as both an input and an output of profiled ops, which would
            // lead to double counting; a flag marks whether its range was already updated for this image
            iter.second->resetUpdatedRangeFlags();
        }
        count++;
        // image preprocessing
        Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor);

        MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors,
                                                 const MNN::OperatorInfo* info) {
            for (auto t : nTensors) {
                if (_featureInfo.find(t) != _featureInfo.end()) {
                    // update the range of the input tensors
                    _featureInfo[t]->updateRange();
                }
            }
            return true;
        };
        MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors,
                                                const MNN::OperatorInfo* info) {
            for (auto t : nTensors) {
                if (_featureInfo.find(t) != _featureInfo.end()) {
                    // update the range of the output tensors
                    _featureInfo[t]->updateRange();
                }
            }
            return true;
        };
        _interpreter->runSessionWithCallBackInfo(_session, before, after);
        MNN_PRINT("\rComputeFeatureRange: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
        fflush(stdout);
    }
    MNN_PRINT("\n");
}
```

```cpp
void TensorStatistic::resetDistribution()
{
    for (int i = 0; i < mIntervals.size(); ++i)
    {
        int cIndex = i;
        // as before, activations default to a single merged channel
        if (mMergeChannel)
        {
            cIndex = 0;
        }
        // take the maximum absolute value; NVIDIA's scheme quantizes activations after ReLU,
        // so all tensor values are positive -- this code appears to make the same assumption
        // that every value produced by the tensor is non-negative, otherwise the logic here is questionable
        auto maxValue = std::max(fabsf(mRangePerChannel[cIndex].second), fabsf(mRangePerChannel[cIndex].first));
        mValidChannel[cIndex] = maxValue > 0.00001f;
        mIntervals[cIndex] = 0.0f;
        if (mValidChannel[cIndex])
        {
            // precompute 2048 / max;
            // multiplying a tensor value by this factor gives the index of its bin
            mIntervals[cIndex] = (float)mBinNumber / maxValue;
        }
    }
    for (auto &c : mDistribution)
    {
        // smoothing: make sure no bin ends up with zero elements
        std::fill(c.begin(), c.end(), 1.0e-07);
    }
    // MNN_PRINT("==> %s max: %f\n", mName.c_str(),std::max(fabsf(mRangePerChannel[0].second),
    // fabsf(mRangePerChannel[0].first)));
}

void TensorStatistic::updateDistribution()
{
    if (mUpdatedDistributionFlag)
    {
        return;
    }
    mUpdatedDistributionFlag = true;
    mOriginTensor->copyToHostTensor(mHostTensor.get());
    int batch   = mHostTensor->batch();
    int channel = mHostTensor->channel();
    int width   = mHostTensor->width();
    int height  = mHostTensor->height();
    auto area   = width * height;
    for (int n = 0; n < batch; ++n)
    {
        auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0);
        for (int c = 0; c < channel; ++c)
        {
            int cIndex = c;
            if (mMergeChannel)
            {
                cIndex = 0;
            }
            if (!mValidChannel[cIndex])
            {
                continue;
            }
            auto multi       = mIntervals[cIndex];
            auto target      = mDistribution[cIndex].data();
            auto dataChannel = dataBatch + c * mHostTensor->stride(1);
            for (int v = 0; v < area; ++v)
            {
                auto data = dataChannel[v];
                if (data == 0)
                {
                    continue;
                }
                // compute the bin index and add 1 to that bin
                int index = static_cast<int>(fabs(data) * multi);
                index     = std::min(index, mBinNumber - 1);
                target[index] += 1.0f;
            }
        }
    }
}

void Calibration::_collectFeatureMapsDistribution() {
    for (auto& iter : _featureInfo) {
        // the tensor values are collected into 2048 bins as the original activation distribution;
        // initialize the distribution of each tensor
        iter.second->resetDistribution();
    }
    // feed input data according to input images
    // update the distributions of the input and output tensors
    MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        for (auto t : nTensors) {
            if (_featureInfo.find(t) != _featureInfo.end()) {
                _featureInfo[t]->updateDistribution();
            }
        }
        return true;
    };
    MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        for (auto t : nTensors) {
            if (_featureInfo.find(t) != _featureInfo.end()) {
                _featureInfo[t]->updateDistribution();
            }
        }
        return true;
    };
    int count = 0;
    for (const auto& img : _imgaes) {
        count++;
        for (auto& iter : _featureInfo) {
            iter.second->resetUpdatedDistributionFlag();
        }
        Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor);
        _interpreter->runSessionWithCallBackInfo(_session, before, after);
        MNN_PRINT("\rCollectFeatureDistribution: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
        fflush(stdout);
    }
    MNN_PRINT("\n");
}
```
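As a quick numeric check of the binning above: with mBinNumber = 2048 and a recorded maximum absolute value of, say, 6.0, mIntervals becomes 2048 / 6.0 ≈ 341.33, so an activation of 2.5 lands in bin static_cast<int>(2.5 × 341.33) = 853, and any value at or beyond 6.0 is clamped into the last bin, 2047.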
```cpp
// compute the scale value for each tensor
std::vector<float> TensorStatistic::finishAndCompute()
{
    std::vector<float> scaleValue(mDistribution.size(), 0.0f);
    // mMergeChannel defaults to true
    if (mMergeChannel)
    {
        if (!mValidChannel[0])
        {
            return scaleValue;
        }
        float sum = 0.0f;
        auto &distribution = mDistribution[0];
        // normalize the histogram into a probability distribution
        std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; });
        std::for_each(distribution.begin(), distribution.end(), [sum](float &n) { n /= sum; });
        // compute the threshold of the distribution; threshold is a bin index
        auto threshold = _computeThreshold(distribution);
        // derive the scale value from the threshold
        auto scale = ((float)threshold + 0.5) / mIntervals[0] / 127.0f;
        // MNN_PRINT("==> %s == %d, %f, %f\n", mName.c_str(),threshold, 1.0f / mIntervals[0], scale * 127.0f);
        std::fill(scaleValue.begin(), scaleValue.end(), scale);
        return scaleValue;
    }
    for (int c = 0; c < mDistribution.size(); ++c)
    {
        if (!mValidChannel[c])
        {
            continue;
        }
        float sum = 0.0f;
        auto &distribution = mDistribution[c];
        std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; });
        std::for_each(distribution.begin(), distribution.end(), [sum](float &n) { n /= sum; });
        auto threshold = _computeThreshold(distribution);
        scaleValue[c] = ((float)threshold + 0.5) / mIntervals[c] / 127.0;
    }
    return scaleValue;
}
```
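Continuing the numeric example: suppose the threshold search below returns threshold = 512 for a tensor whose maximum absolute value was 6.0. Then mIntervals[0] = 2048 / 6.0 ≈ 341.33, the clipping range becomes (512 + 0.5) / 341.33 ≈ 1.50, and the stored scale is 1.50 / 127 ≈ 0.0118; float values around ±1.50 map to int8 ±127 and anything larger saturates.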
```cpp
int TensorStatistic::_computeThreshold(const std::vector<float> &distribution)
{
    // 128 is half of the int8 range, since all values are assumed positive
    const int targetBinNums = 128;
    int threshold           = targetBinNums;
    if (mThresholdMethod == THRESHOLD_KL)
    {
        float minKLDivergence   = 10000.0f;
        float afterThresholdSum = 0.0f;
        std::for_each(distribution.begin() + targetBinNums, distribution.end(),
                      [&](float n) { afterThresholdSum += n; });
        for (int i = targetBinNums; i < mBinNumber; ++i)
        {
            std::vector<float> quantizedDistribution(targetBinNums);
            std::vector<float> candidateDistribution(i);
            std::vector<float> expandedDistribution(i);
            std::copy(distribution.begin(), distribution.begin() + i, candidateDistribution.begin());
            // build the P distribution
            candidateDistribution[i - 1] += afterThresholdSum;
            afterThresholdSum -= distribution[i];
            const float binInterval = (float)i / (float)targetBinNums;
            // merge i bins to target bins
            // quantize the first i bins into the 128 int8 bins
            for (int j = 0; j < targetBinNums; ++j)
            {
                const float start   = j * binInterval;
                const float end     = start + binInterval;
                const int leftUpper = static_cast<int>(std::ceil(start));
                if (leftUpper > start)
                {
                    // e.g. if start is 2.4, add 0.6 of the left-boundary bin
                    const float leftScale = leftUpper - start;
                    quantizedDistribution[j] += leftScale * distribution[leftUpper - 1];
                }
                const int rightLower = static_cast<int>(std::floor(end));
                if (rightLower < end)
                {
                    // e.g. if end is 2.4, add 0.4 of the right-boundary bin
                    const float rightScale = end - rightLower;
                    quantizedDistribution[j] += rightScale * distribution[rightLower];
                }
                std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower,
                              [&](float n) { quantizedDistribution[j] += n; });
            }
            // expand target bins to i bins
            // dequantize the 128 int8 bins back into i bins
            for (int j = 0; j < targetBinNums; ++j)
            {
                const float start   = j * binInterval;
                const float end     = start + binInterval;
                float count         = 0;
                const int leftUpper = static_cast<int>(std::ceil(start));
                float leftScale     = 0.0f;
                if (leftUpper > start)
                {
                    // as above: check whether this int8 bin covers part of the left-boundary bin,
                    // skipping bins whose original distribution is 0
                    leftScale = leftUpper - start;
                    if (distribution[leftUpper - 1] != 0)
                    {
                        count += leftScale;
                    }
                }
                const int rightLower = static_cast<int>(std::floor(end));
                float rightScale     = 0.0f;
                if (rightLower < end)
                {
                    // check whether this int8 bin covers part of the right-boundary bin,
                    // skipping bins whose original distribution is 0
                    rightScale = end - rightLower;
                    if (distribution[rightLower] != 0)
                    {
                        count += rightScale;
                    }
                }
                std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower, [&](float n) {
                    if (n != 0)
                    {
                        count += 1;
                    }
                });
                if (count == 0)
                {
                    continue;
                }
                // spread the bin value evenly over the covered non-zero bins
                const float toExpandValue = quantizedDistribution[j] / count;
                if (leftUpper > start && distribution[leftUpper - 1] != 0)
                {
                    // assign to the left-boundary bin
                    expandedDistribution[leftUpper - 1] += toExpandValue * leftScale;
                }
                if (rightLower < end && distribution[rightLower] != 0)
                {
                    // assign to the right-boundary bin
                    expandedDistribution[rightLower] += toExpandValue * rightScale;
                }
                for (int k = leftUpper; k < rightLower; ++k)
                {
                    // assign to the non-zero bins in between
                    if (distribution[k] != 0)
                    {
                        expandedDistribution[k] += toExpandValue;
                    }
                }
            }
            // compute the KL divergence
            const float curKL = _klDivergence(candidateDistribution, expandedDistribution);
            // std::cout << "=====> KL: " << i << " ==> " << curKL << std::endl;
            if (curKL < minKLDivergence)
            {
                minKLDivergence = curKL;
                threshold       = i;
            }
        }
    }
    else if (mThresholdMethod == THRESHOLD_MAX)
    {
        threshold = mBinNumber - 1;
    }
    else
    {
        // TODO, support other method
        MNN_ASSERT(false);
    }
    return threshold;
}
```

In pseudocode terms, the threshold search above works as follows. P is the original distribution with all of the mass beyond the candidate threshold i folded into the rightmost (i-th) bin; if the threshold is a reasonable one, folding the tail changes little and P stays close to the distribution over the first i bins. Q is the distribution obtained by quantizing the first i bins down to 128 bins and then expanding them back to i bins, so the KL divergence between P and Q models the combined error from quantization and from the threshold (clipping) choice.
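For reference, the _klDivergence call at the end of the loop can be pictured roughly as the minimal sketch below, assuming both histograms are already normalized; MNN's actual implementation may differ in details such as zero-handling.

```cpp
// Minimal sketch of KL(P || Q) over two equal-length histograms that have
// already been normalized into probability distributions. Bins where either
// distribution is (near) zero contribute nothing here.
#include <cmath>
#include <vector>

static float KLDivergence(const std::vector<float>& p, const std::vector<float>& q) {
    float result = 0.0f;
    for (size_t i = 0; i < p.size(); ++i) {
        if (p[i] > 1e-9f && q[i] > 1e-9f) {
            result += p[i] * std::log(p[i] / q[i]);
        }
    }
    return result;
}
```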

```cpp
void Calibration::_updateScale() {
    for (const auto& op : _originaleModel->oplists) {
        const auto opType = op->type;
        // skip ops that are not quantized
        if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise &&
            opType != MNN::OpType_Eltwise) {
            continue;
        }
        auto tensorsPair = _opInfo.find(op->name);
        if (tensorsPair == _opInfo.end()) {
            MNN_ERROR("Can't find tensors for %s\n", op->name.c_str());
        }
        if (opType == MNN::OpType_Eltwise) {
            auto param = op->main.AsEltwise();
            // Now only support AddInt8, i.e. EltwiseType_SUM
            if (param->type != MNN::EltwiseType_SUM) {
                continue;
            }
            // record the scale values
            const auto& inputScale0   = _scales[tensorsPair->second.first[0]];
            const auto& inputScale1   = _scales[tensorsPair->second.first[1]];
            const auto& outputScale   = _scales[tensorsPair->second.second[0]];
            const int outputScaleSize = outputScale.size();
            std::vector<float> outputInvertScale(outputScaleSize);
            Helper::invertData(outputInvertScale.data(), outputScale.data(), outputScaleSize);
            op->type = MNN::OpType_EltwiseInt8;
            op->main.Reset();
            op->main.type = MNN::OpParameter_EltwiseInt8;

            auto eltwiseInt8Param         = new MNN::EltwiseInt8T;
            auto input0ScaleParam         = new MNN::QuantizedFloatParamT;
            auto input1ScaleParam         = new MNN::QuantizedFloatParamT;
            auto outputScaleParam         = new MNN::QuantizedFloatParamT;
            input0ScaleParam->tensorScale = inputScale0;
            input1ScaleParam->tensorScale = inputScale1;
            outputScaleParam->tensorScale = outputInvertScale;
            eltwiseInt8Param->inputQuan0  = std::unique_ptr<MNN::QuantizedFloatParamT>(input0ScaleParam);
            eltwiseInt8Param->inputQuan1  = std::unique_ptr<MNN::QuantizedFloatParamT>(input1ScaleParam);
            eltwiseInt8Param->outputQuan  = std::unique_ptr<MNN::QuantizedFloatParamT>(outputScaleParam);
            op->main.value                = eltwiseInt8Param;
            continue;
        }

        // below is Conv/DepthwiseConv
        const auto& inputScale  = _scales[tensorsPair->second.first[0]];
        const auto& outputScale = _scales[tensorsPair->second.second[0]];

        auto param                = op->main.AsConvolution2D();
        param->common->inputCount = tensorsPair->second.first[0]->channel();
        const int channles        = param->common->outputCount;
        const int weightSize      = param->weight.size();
        param->symmetricQuan.reset(new MNN::QuantizedFloatParamT);
        auto& quantizedParam = param->symmetricQuan;
        quantizedParam->scale.resize(channles);
        quantizedParam->weight.resize(weightSize);
        quantizedParam->bias.resize(channles);
        // the two convolution types are quantized with different helpers
        if (opType == MNN::OpType_Convolution) {
            QuantizeConvPerChannel(param->weight.data(), param->weight.size(), param->bias.data(),
                                   quantizedParam->weight.data(), quantizedParam->bias.data(),
                                   quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
            op->type = MNN::OpType_ConvInt8;
        } else if (opType == MNN::OpType_ConvolutionDepthwise) {
            QuantizeDepthwiseConv(param->weight.data(), param->weight.size(), param->bias.data(),
                                  quantizedParam->weight.data(), quantizedParam->bias.data(),
                                  quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
            op->type = MNN::OpType_DepthwiseConvInt8;
        }
        // replace relu6 with relu
        if (param->common->relu6) {
            param->common->relu  = true;
            param->common->relu6 = false;
        }
        param->weight.clear();
        param->bias.clear();
    }
}
```
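The weight-side helpers are not shown here. For the max_abs weight method, the per-channel part of what QuantizeConvPerChannel does can be pictured roughly as in the sketch below; the function name and weight layout are illustrative, and MNN's real helper additionally handles the bias and combines the weight scales with the input/output feature scales.

```cpp
// Rough sketch of max_abs per-channel weight quantization for a convolution whose
// weights are laid out as [outputChannel][weightsPerChannel]. Illustrative only.
#include <algorithm>
#include <cmath>
#include <cstdint>

void QuantizeWeightsMaxAbsPerChannel(const float* weights, int outputChannels, int weightsPerChannel,
                                     int8_t* quantizedWeights, float* weightScales) {
    for (int oc = 0; oc < outputChannels; ++oc) {
        const float* w = weights + oc * weightsPerChannel;
        float maxAbs = 0.0f;
        for (int i = 0; i < weightsPerChannel; ++i) {
            maxAbs = std::max(maxAbs, std::fabs(w[i])); // per-channel max absolute value
        }
        const float scale = maxAbs > 0.0f ? maxAbs / 127.0f : 1.0f;
        weightScales[oc]  = scale;
        for (int i = 0; i < weightsPerChannel; ++i) {
            int q = static_cast<int>(std::round(w[i] / scale));
            quantizedWeights[oc * weightsPerChannel + i] =
                static_cast<int8_t>(std::max(-127, std::min(127, q)));
        }
    }
}
```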
```cpp
void Calibration::_insertDequantize() {
    // Search All Int Tensors
    std::set<int> int8Tensors; // all tensors that are quantized
    std::set<int> int8Outputs; // quantized tensors that are also final outputs
    for (auto& op : _originaleModel->oplists) {
        if (Helper::INT8SUPPORTED_OPS.count(op->type) > 0) {
            for (auto index : op->inputIndexes) {
                int8Tensors.insert(index);
            }
            for (auto index : op->outputIndexes) {
                int8Tensors.insert(index);
                int8Outputs.insert(index);
            }
        }
    }
    for (auto& op : _originaleModel->oplists) {
        for (auto index : op->inputIndexes) {
            auto iter = int8Outputs.find(index);
            if (iter != int8Outputs.end()) {
                int8Outputs.erase(iter);
            }
        }
    }

    // Insert Convert For Not Support Int8 Ops
    // handle the ops that do not support quantization
    for (auto iter = _originaleModel->oplists.begin(); iter != _originaleModel->oplists.end();) {
        auto op           = iter->get();
        const auto opType = op->type;
        const auto name   = op->name;
        // check whether is output op
        // if Yes, insert dequantization op after this op
        if (Helper::INT8SUPPORTED_OPS.find(opType) != Helper::INT8SUPPORTED_OPS.end()) {
            // this is quantized op
            iter++;
            continue;
        }

        auto& inputIndexes  = op->inputIndexes;
        const int inputSize = inputIndexes.size();

        // insert dequantization op before this op
        // if an input of this op is int8, insert a dequantization op in front of it
        for (int i = 0; i < inputSize; ++i) {
            const auto curInputIndex = inputIndexes[i];
            if (int8Tensors.find(curInputIndex) == int8Tensors.end()) {
                continue;
            }
            auto input        = _tensorMap[curInputIndex];
            auto inputOpScale = _scales[input];

            // construct new op
            auto dequantizationOp       = new MNN::OpT;
            dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
            dequantizationOp->name      = "___Int8ToFloat___For_" + name + flatbuffers::NumToString(i);

            dequantizationOp->type           = MNN::OpType_Int8ToFloat;
            auto dequantizationParam         = new MNN::QuantizedFloatParamT;
            dequantizationOp->main.value     = dequantizationParam;
            dequantizationParam->tensorScale = inputOpScale;

            dequantizationOp->inputIndexes.push_back(curInputIndex);
            dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
            _originaleModel->tensorName.push_back(dequantizationOp->name);

            // reset current op's input index at i
            inputIndexes[i] = dequantizationOp->outputIndexes[0];

            iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(dequantizationOp));
            iter++;
        }

        iter++;
        // LOG(INFO) << "insert quantization op after this op if neccessary";
        // insert quantization op after this op if neccessary
        // if an output of this op feeds a quantized op, insert a quantization op after it
        for (int i = 0; i < op->outputIndexes.size(); ++i) {
            const auto outputIndex = op->outputIndexes[i];
            if (int8Tensors.find(outputIndex) == int8Tensors.end()) {
                continue;
            }
            auto output   = _tensorMap[outputIndex];
            auto curScale = _scales[output];
            // construct one quantization op(FloatToInt8)
            auto quantizationOp        = new MNN::OpT;
            quantizationOp->main.type  = MNN::OpParameter_QuantizedFloatParam;
            quantizationOp->name       = name + "___FloatToInt8___" + flatbuffers::NumToString(i);
            quantizationOp->type       = MNN::OpType_FloatToInt8;
            auto quantizationParam     = new MNN::QuantizedFloatParamT;
            quantizationOp->main.value = quantizationParam;

            const int channels = curScale.size();
            std::vector<float> quantizationScale(channels);
            Helper::invertData(quantizationScale.data(), curScale.data(), channels);
            quantizationParam->tensorScale = quantizationScale;

            quantizationOp->inputIndexes.push_back(_originaleModel->tensorName.size());
            quantizationOp->outputIndexes.push_back(outputIndex);
            _originaleModel->tensorName.push_back(_originaleModel->tensorName[outputIndex]);
            _originaleModel->tensorName[outputIndex] = quantizationOp->name;
            op->outputIndexes[i]                     = quantizationOp->inputIndexes[0];

            iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(quantizationOp));
            iter++;
        }
    }

    // Insert Turn float Op for output
    // for quantized tensors that are final outputs of the model, append a dequantization op after them
    for (auto index : int8Outputs) {
        // construct new op
        auto dequantizationOp       = new MNN::OpT;
        dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
        dequantizationOp->name      = "___Int8ToFloat___For_" + flatbuffers::NumToString(index);

        dequantizationOp->type           = MNN::OpType_Int8ToFloat;
        auto dequantizationParam         = new MNN::QuantizedFloatParamT;
        dequantizationOp->main.value     = dequantizationParam;
        dequantizationParam->tensorScale = _scales[_tensorMap[index]];

        dequantizationOp->inputIndexes.push_back(index);
        dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
        auto originTensorName              = _originaleModel->tensorName[index];
        _originaleModel->tensorName[index] = dequantizationOp->name;
        _originaleModel->tensorName.emplace_back(originTensorName);

        _originaleModel->oplists.insert(_originaleModel->oplists.end(), std::unique_ptr<MNN::OpT>(dequantizationOp));
    }
}
```
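At this point the rewritten NetT contains the int8 convolution / eltwise ops plus the inserted FloatToInt8 / Int8ToFloat conversion ops, and it is left to the caller to serialize it back into a .mnn flatbuffer. In MNN this whole pipeline is normally driven by the offline quantization command-line tool, roughly `./quantized.out origin_float.mnn quantized_int8.mnn config.json`; the exact binary name and argument order depend on the MNN version, so check the documentation for the release you are using.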