The offline quantization follows NVIDIA's scheme and uses symmetric quantization. For activations, the scale is chosen by minimizing the KL divergence; for weights, the max_abs method is used with per-channel quantization, so every convolution kernel (output channel) gets its own scale.
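As a rough sketch of what the symmetric scheme means here (the exact rounding and clamping conventions are not shown in this excerpt), a threshold $T$ determines the whole mapping between float and int8:

$$
s = \frac{T}{127}, \qquad x_q = \operatorname{clamp}\!\left(\operatorname{round}\!\left(\frac{x}{s}\right),\, -127,\, 127\right), \qquad \hat{x} = s \cdot x_q
$$

For weights, $T$ is the per-channel maximum absolute value (max_abs); for activations, $T$ is the threshold selected by the KL procedure described below.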
```cpp
Calibration::Calibration(MNN::NetT* model, uint8_t* modelBuffer, const int bufferSize, const std::string& configPath)
: _originaleModel(model) {
// when the format of input image is RGB/BGR, channels equal to 3, GRAY is 1
int channles = 3;
rapidjson::Document document;
{
std::ifstream fileNames(configPath.c_str());
std::ostringstream output;
output << fileNames.rdbuf();
auto outputStr = output.str();
document.Parse(outputStr.c_str());
if (document.HasParseError()) {
MNN_ERROR("Invalid json\n");
return;
}
}
auto picObj = document.GetObject();
ImageProcess::Config config;
config.filterType = BILINEAR;
config.destFormat = BGR;
{
if (picObj.HasMember("format")) {
auto format = picObj["format"].GetString();
static std::map<std::string, ImageFormat> formatMap{{"BGR", BGR}, {"RGB", RGB}, {"GRAY", GRAY}};
if (formatMap.find(format) != formatMap.end()) {
config.destFormat = formatMap.find(format)->second;
}
}
}
if (config.destFormat == GRAY) {
channles = 1;
}
config.sourceFormat = RGBA;
std::string imagePath;
_imageNum = 0;
{
if (picObj.HasMember("mean")) {
auto mean = picObj["mean"].GetArray();
int cur = 0;
for (auto iter = mean.begin(); iter != mean.end(); iter++) {
config.mean[cur++] = iter->GetFloat();
}
}
if (picObj.HasMember("normal")) {
auto normal = picObj["normal"].GetArray();
int cur = 0;
for (auto iter = normal.begin(); iter != normal.end(); iter++) {
config.normal[cur++] = iter->GetFloat();
}
}
if (picObj.HasMember("width")) {
_width = picObj["width"].GetInt();
}
if (picObj.HasMember("height")) {
_height = picObj["height"].GetInt();
}
if (picObj.HasMember("path")) {
imagePath = picObj["path"].GetString();
}
if (picObj.HasMember("used_image_num")) {
_imageNum = picObj["used_image_num"].GetInt();
}
if (picObj.HasMember("feature_quantize_method")) {
std::string method = picObj["feature_quantize_method"].GetString();
if (Helper::featureQuantizeMethod.find(method) != Helper::featureQuantizeMethod.end()) {
_featureQuantizeMethod = method;
} else {
MNN_ERROR("not supported feature quantization method: %s\n", method.c_str());
return;
}
}
if (picObj.HasMember("weight_quantize_method")) {
std::string method = picObj["weight_quantize_method"].GetString();
if (Helper::weightQuantizeMethod.find(method) != Helper::weightQuantizeMethod.end()) {
_weightQuantizeMethod = method;
} else {
MNN_ERROR("not supported weight quantization method: %s\n", method.c_str());
return;
}
}
DLOG(INFO) << "Use feature quantization method: " << _featureQuantizeMethod;
DLOG(INFO) << "Use weight quantization method: " << _weightQuantizeMethod;
}
std::shared_ptr<ImageProcess> process(ImageProcess::create(config));
_process = process;
// read images file names
Helper::readImages(_imgaes, imagePath.c_str(), &_imageNum);
_initMNNSession(modelBuffer, bufferSize, channles);
_initMaps();
}
```

The Calibration quantization flow is as follows:

- Read the quantization config file (format, mean, normal, width, height, path, used_image_num, feature_quantize_method, weight_quantize_method) and set the corresponding quantization parameters
- Set up the image preprocessing and read the list of calibration images
- Initialize the inference model used to collect tensor statistics
- Initialize a TensorStatistic object for every tensor whose statistics need to be collected

The Session used to collect tensor statistics and the per-tensor TensorStatistic map are initialized by _initMNNSession and _initMaps:

```cpp
void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels) {
_interpreter.reset(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize));
MNN::ScheduleConfig config;
_session = _interpreter->createSession(config);
_inputTensor = _interpreter->getSessionInput(_session, NULL);
_inputTensorDims.resize(4);
auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat;
DCHECK(4 == _inputTensor->dimensions()) << "Only support 4 dimensions input";
if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) {
_inputTensorDims[0] = 1;
_inputTensorDims[1] = _height;
_inputTensorDims[2] = _width;
_inputTensorDims[3] = channels;
} else if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NC4HW4) {
_inputTensorDims[0] = 1;
_inputTensorDims[1] = channels;
_inputTensorDims[2] = _height;
_inputTensorDims[3] = _width;
} else {
DLOG(ERROR) << "Input Data Format ERROR!";
}
if (_featureQuantizeMethod == "KL") {
_interpreter->resizeTensor(_inputTensor, _inputTensorDims);
_interpreter->resizeSession(_session);
} else if (_featureQuantizeMethod == "ADMM") {
DCHECK((_imageNum * 4 * _height * _width) < (INT_MAX / 4)) << "Use Little Number of Images When Use ADMM";
_inputTensorDims[0] = _imageNum;
_interpreter->resizeTensor(_inputTensor, _inputTensorDims);
_interpreter->resizeSession(_session);
}
_interpreter->releaseModel();
}
void Calibration::_initMaps() {
_featureInfo.clear();
_opInfo.clear(); // maps each op name to its input and output tensor pointers
_tensorMap.clear();
// std::set<std::string> Helper::gNeedFeatureOp = {"Convolution", "ConvolutionDepthwise", "Eltwise", "Pooling"};
// ops that get quantized: Convolution, ConvolutionDepthwise, Eltwise (Add) and Pooling
// run mnn once, initialize featureMap, opInfo map
// 'before' is invoked before each op executes
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
_opInfo[info->name()].first = nTensors;
// if this op will be quantized, add all of its input tensors to the map of tensors whose statistics are collected, creating one TensorStatistic per tensor
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) == _featureInfo.end()) {
_featureInfo[t] = std::shared_ptr<TensorStatistic>(
new TensorStatistic(t, _featureQuantizeMethod, info->name() + "__input"));
}
}
}
return false;
};
// 'after' is invoked after each op executes
MNN::TensorCallBackWithInfo after = [this](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
_opInfo[info->name()].second = nTensors;
// if this op will be quantized, add all of its output tensors to the statistics map as well, creating one TensorStatistic per tensor
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) == _featureInfo.end()) {
_featureInfo[t] =
std::shared_ptr<TensorStatistic>(new TensorStatistic(t, _featureQuantizeMethod, info->name()));
}
}
}
return true;
};
// run inference once with the two callbacks installed to record the tensors that need statistics
_interpreter->runSessionWithCallBackInfo(_session, before, after);
// build the mapping from tensor index to tensor pointer
for (auto& op : _originaleModel->oplists) {
if (_opInfo.find(op->name) == _opInfo.end()) {
continue;
}
for (int i = 0; i < op->inputIndexes.size(); ++i) {
_tensorMap[op->inputIndexes[i]] = _opInfo[op->name].first[i];
}
for (int i = 0; i < op->outputIndexes.size(); ++i) {
_tensorMap[op->outputIndexes[i]] = _opInfo[op->name].second[i];
}
}
if (_featureQuantizeMethod == "KL") {
// the KL method is not suitable for the input tensor
// set the tensor-statistic method of input tensor as THRESHOLD_MAX
auto inputTensorStatistic = _featureInfo.find(_inputTensor);
if (inputTensorStatistic != _featureInfo.end()) {
inputTensorStatistic->second->setThresholdMethod(THRESHOLD_MAX);
}
}
}
```

runQuantizeModel then drives the whole quantization pipeline:

```cpp
void Calibration::runQuantizeModel() {
// compute the activation scales
if (_featureQuantizeMethod == "KL") {
_computeFeatureScaleKL();
} else if (_featureQuantizeMethod == "ADMM") {
_computeFeatureScaleADMM();
}
// quantize the weights, create the quantized ops and rewrite the network
_updateScale();
// for ops that do not support int8, insert dequantize ops to convert back to float; dequantize ops are also inserted for the model outputs
_insertDequantize();
}
void Calibration::_computeFeatureScaleKL() {
// compute the range of each feature map
_computeFeatureMapsRange();
// collect the distribution of each feature map
_collectFeatureMapsDistribution();
_scales.clear();
for (auto& iter : _featureInfo) {
AUTOTIME;
// compute the scale for this tensor
_scales[iter.first] = iter.second->finishAndCompute();
}
//_featureInfo.clear();//No need now
}
```

TensorStatistic tracks the statistics of a single tensor. updateRange accumulates its per-channel min/max and is called once per calibration image:

```cpp
void resetUpdatedRangeFlags() {
mUpdatedRangeFlags = false;
}
void TensorStatistic::updateRange() {
if (mUpdatedRangeFlags) {
return;
}
mUpdatedRangeFlags = true;
mOriginTensor->copyToHostTensor(mHostTensor.get());
int batch = mHostTensor->batch();
int channel = mHostTensor->channel();
int width = mHostTensor->width();
int height = mHostTensor->height();
auto area = width * height;
for (int n = 0; n < batch; ++n)
{
auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0);
for (int c = 0; c < channel; ++c)
{
int cIndex = c;
// mMergeChannel defaults to true, i.e. activations are not quantized per channel
if (mMergeChannel)
{
cIndex = 0;
}
// track the min and max values
auto minValue = mRangePerChannel[cIndex].first;
auto maxValue = mRangePerChannel[cIndex].second;
auto dataChannel = dataBatch + c * mHostTensor->stride(1);
for (int v = 0; v < area; ++v)
{
minValue = std::min(minValue, dataChannel[v]);
maxValue = std::max(maxValue, dataChannel[v]);
}
mRangePerChannel[cIndex].first = minValue;
mRangePerChannel[cIndex].second = maxValue;
}
}
}
```

```cpp
void Calibration::_computeFeatureMapsRange() {
// feed input data according to input images
int count = 0;
for (const auto& img : _imgaes) {
for (auto& iter : _featureInfo) {
// an op's input tensor may also be another op's output tensor, so the same tensor could be visited twice;
// a flag records whether its range has already been updated for the current image
iter.second->resetUpdatedRangeFlags();
}
count++;
// image preprocessing
Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor);
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
// update the range of the input tensors
_featureInfo[t]->updateRange();
}
}
return true;
};
MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
// update the range of the output tensors
_featureInfo[t]->updateRange();
}
}
return true;
};
_interpreter->runSessionWithCallBackInfo(_session, before, after);
MNN_PRINT("\rComputeFeatureRange: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
fflush(stdout);
}
MNN_PRINT("\n");
}
```
```cpp
void TensorStatistic::resetDistribution()
{
for (int i = 0; i < mIntervals.size(); ++i)
{
int cIndex = i;
// same as before
if (mMergeChannel)
{
cIndex = 0;
}
// take the maximum absolute value. NVIDIA's scheme quantizes post-ReLU activations, so all values are positive;
// the code here appears to make the same assumption that tensor values are non-negative, otherwise this logic would be questionable
auto maxValue = std::max(fabsf(mRangePerChannel[cIndex].second), fabsf(mRangePerChannel[cIndex].first));
mValidChannel[cIndex] = maxValue > 0.00001f;
mIntervals[cIndex] = 0.0f;
if (mValidChannel[cIndex])
{
// precompute mBinNumber / max (2048 / max);
// multiplying a tensor value by this factor gives the index of its bin
mIntervals[cIndex] = (float)mBinNumber / maxValue;
}
}
for (auto &c : mDistribution)
{
// smoothing: make sure no bin is completely empty
std::fill(c.begin(), c.end(), 1.0e-07);
}
// MNN_PRINT("==> %s max: %f\n", mName.c_str(),std::max(fabsf(mRangePerChannel[0].second),
// fabsf(mRangePerChannel[0].first)));
}
void TensorStatistic::updateDistribution()
{
if (mUpdatedDistributionFlag)
{
return;
}
mUpdatedDistributionFlag = true;
mOriginTensor->copyToHostTensor(mHostTensor.get());
int batch = mHostTensor->batch();
int channel = mHostTensor->channel();
int width = mHostTensor->width();
int height = mHostTensor->height();
auto area = width * height;
for (int n = 0; n < batch; ++n)
{
auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0);
for (int c = 0; c < channel; ++c)
{
int cIndex = c;
if (mMergeChannel)
{
cIndex = 0;
}
if (!mValidChannel[cIndex])
{
continue;
}
auto multi = mIntervals[cIndex];
auto target = mDistribution[cIndex].data();
auto dataChannel = dataBatch + c * mHostTensor->stride(1);
for (int v = 0; v < area; ++v)
{
auto data = dataChannel[v];
if (data == 0)
{
continue;
}
// compute the bin index and increment that bin
int index = static_cast<int>(fabs(data) * multi);
index = std::min(index, mBinNumber - 1);
target[index] += 1.0f;
}
}
}
}
```
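Putting the two functions above together: if $m$ is the maximum absolute value recorded for a tensor and the histogram uses 2048 bins (mBinNumber), every non-zero value $x$ of that tensor is counted into bin

$$
\operatorname{bin}(x) = \min\left(\left\lfloor |x| \cdot \frac{2048}{m} \right\rfloor,\ 2047\right),
$$

accumulated over all calibration images.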
```cpp
void Calibration::_collectFeatureMapsDistribution() {
for (auto& iter : _featureInfo) {
// the activation values are binned into 2048 bins as the original distribution; reset the distribution of every tensor
iter.second->resetDistribution();
}
// feed input data according to input images
// the callbacks update the distributions of the input and output tensors
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateDistribution();
}
}
return true;
};
MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateDistribution();
}
}
return true;
};
int count = 0;
for (const auto& img : _imgaes) {
count++;
for (auto& iter : _featureInfo) {
iter.second->resetUpdatedDistributionFlag();
}
Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor);
_interpreter->runSessionWithCallBackInfo(_session, before, after);
MNN_PRINT("\rCollectFeatureDistribution: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
fflush(stdout);
}
MNN_PRINT("\n");
}
```

Each tensor's scale value is then computed from its accumulated distribution:

```cpp
std::vector<float> TensorStatistic::finishAndCompute()
{
std::vector<float> scaleValue(mDistribution.size(), 0.0f);
// defaults to true
if (mMergeChannel)
{
if (!mValidChannel[0])
{
return scaleValue;
}
float sum = 0.0f;
auto &distribution = mDistribution[0];
// normalize the histogram into a probability distribution
std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; });
std::for_each(distribution.begin(), distribution.end(), [sum](float &n) { n /= sum; });
// compute the threshold over the distribution; threshold is a bin index
auto threshold = _computeThreshold(distribution);
// derive the scale from the threshold
auto scale = ((float)threshold + 0.5) / mIntervals[0] / 127.0f;
// MNN_PRINT("==> %s == %d, %f, %f\n", mName.c_str(),threshold, 1.0f / mIntervals[0], scale * 127.0f);
std::fill(scaleValue.begin(), scaleValue.end(), scale);
return scaleValue;
}
for (int c = 0; c < mDistribution.size(); ++c)
{
if (!mValidChannel[c])
{
continue;
}
float sum = 0.0f;
auto &distribution = mDistribution[c];
std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; });
std::for_each(distribution.begin(), distribution.end(), [sum](float &n) { n /= sum; });
auto threshold = _computeThreshold(distribution);
scaleValue[c] = ((float)threshold + 0.5) / mIntervals[c] / 127.0;
}
return scaleValue;
}
```
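The scale formula follows directly from the interval definition above: since mIntervals $= 2048 / m$, the line `scale = (threshold + 0.5) / mIntervals / 127` computes

$$
s = \frac{\mathrm{threshold} + 0.5}{127} \cdot \frac{m}{2048},
$$

i.e. the real value at the center of the threshold bin is the one mapped to the int8 value 127.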
```cpp
int TensorStatistic::_computeThreshold(const std::vector<float> &distribution)
{
// 128 is half of the int8 range (all values are assumed to be positive)
const int targetBinNums = 128;
int threshold = targetBinNums;
if (mThresholdMethod == THRESHOLD_KL)
{
float minKLDivergence = 10000.0f;
float afterThresholdSum = 0.0f;
std::for_each(distribution.begin() + targetBinNums, distribution.end(),
[&](float n) { afterThresholdSum += n; });
for (int i = targetBinNums; i < mBinNumber; ++i)
{
std::vector<float> quantizedDistribution(targetBinNums);
std::vector<float> candidateDistribution(i);
std::vector<float> expandedDistribution(i);
std::copy(distribution.begin(), distribution.begin() + i, candidateDistribution.begin());
// build the P distribution (the tail beyond bin i is folded into the last bin)
candidateDistribution[i - 1] += afterThresholdSum;
afterThresholdSum -= distribution[i];
const float binInterval = (float)i / (float)targetBinNums;
// merge i bins to target bins
// quantize the first i bins into the 128 int8 bins
for (int j = 0; j < targetBinNums; ++j)
{
const float start = j * binInterval;
const float end = start + binInterval;
const int leftUpper = static_cast<int>(std::ceil(start));
if (leftUpper > start)
{
// e.g. if start is 2.4, add 0.6 of the left boundary bin's mass
const float leftScale = leftUpper - start;
quantizedDistribution[j] += leftScale * distribution[leftUpper - 1];
}
const int rightLower = static_cast<int>(std::floor(end));
if (rightLower < end)
{
// likewise, add the fractional overlap with the right boundary bin (e.g. if end is 3.4, add 0.4 of that bin's mass)
const float rightScale = end - rightLower;
quantizedDistribution[j] += rightScale * distribution[rightLower];
}
std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower,
[&](float n) { quantizedDistribution[j] += n; });
}
// expand target bins to i bins
// dequantize the 128 int8 bins back into i bins
for (int j = 0; j < targetBinNums; ++j)
{
const float start = j * binInterval;
const float end = start + binInterval;
float count = 0;
const int leftUpper = static_cast<int>(std::ceil(start));
float leftScale = 0.0f;
if (leftUpper > start)
{
// as above, check whether this int8 bin partially covers the left boundary bin; bins that are zero in the original distribution are not counted
leftScale = leftUpper - start;
if (distribution[leftUpper - 1] != 0)
{
count += leftScale;
}
}
const int rightLower = static_cast<int>(std::floor(end));
float rightScale = 0.0f;
if (rightLower < end)
{
// check whether this int8 bin partially covers the right boundary bin; zero bins are not counted
rightScale = end - rightLower;
if (distribution[rightLower] != 0)
{
count += rightScale;
}
}
std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower, [&](float n) {
if (n != 0)
{
count += 1;
}
});
if (count == 0)
{
continue;
}
// spread the bin's mass evenly over the covered non-zero bins
const float toExpandValue = quantizedDistribution[j] / count;
if (leftUpper > start && distribution[leftUpper - 1] != 0)
{
// assign the fractional share to the left boundary bin
expandedDistribution[leftUpper - 1] += toExpandValue * leftScale;
}
if (rightLower < end && distribution[rightLower] != 0)
{
// assign the fractional share to the right boundary bin
expandedDistribution[rightLower] += toExpandValue * rightScale;
}
for (int k = leftUpper; k < rightLower; ++k)
{
// assign to the non-zero bins in between
if (distribution[k] != 0)
{
expandedDistribution[k] += toExpandValue;
}
}
}
// compute the KL divergence between P and Q
const float curKL = _klDivergence(candidateDistribution, expandedDistribution);
// std::cout << "=====> KL: " << i << " ==> " << curKL << std::endl;
if (curKL < minKLDivergence)
{
minKLDivergence = curKL;
threshold = i;
}
}
}
else if (mThresholdMethod == THRESHOLD_MAX)
{
threshold = mBinNumber - 1;
}
else
{
// TODO, support other method
MNN_ASSERT(false);
}
return threshold;
}
```

The idea behind the code above:

P is the original distribution truncated at the candidate threshold i, with all of the mass beyond the threshold added into the last bin; if the threshold is well chosen, P differs little from the untruncated distribution of the first i bins.
Q is the distribution obtained by quantizing those i bins into 128 bins and then expanding them back to i bins. Computing the KL divergence between P and Q therefore simulates the combined error from quantization and from the choice of threshold.
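In formula form: for every candidate threshold $i \in [128, 2048)$ the code evaluates

$$
D_{\mathrm{KL}}(P\,\|\,Q) = \sum_{k=0}^{i-1} P_k \log\frac{P_k}{Q_k}
$$

and keeps the threshold with the smallest divergence. (The implementation of `_klDivergence` is not shown in this excerpt; the formula above is the standard definition it would be expected to compute over the two normalized distributions.)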
```cpp
void Calibration::_updateScale() {
for (const auto& op : _originaleModel->oplists) {
const auto opType = op->type;
// skip ops that are not quantized
if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise &&
opType != MNN::OpType_Eltwise) {
continue;
}
auto tensorsPair = _opInfo.find(op->name);
if (tensorsPair == _opInfo.end()) {
MNN_ERROR("Can't find tensors for %s\n", op->name.c_str());
}
if (opType == MNN::OpType_Eltwise) {
auto param = op->main.AsEltwise();
// Now only support AddInt8 (i.e. only EltwiseType_SUM)
if (param->type != MNN::EltwiseType_SUM) {
continue;
}
// fetch the recorded scales
const auto& inputScale0 = _scales[tensorsPair->second.first[0]];
const auto& inputScale1 = _scales[tensorsPair->second.first[1]];
const auto& outputScale = _scales[tensorsPair->second.second[0]];
const int outputScaleSize = outputScale.size();
std::vector<float> outputInvertScale(outputScaleSize);
Helper::invertData(outputInvertScale.data(), outputScale.data(), outputScaleSize);
op->type = MNN::OpType_EltwiseInt8;
op->main.Reset();
op->main.type = MNN::OpParameter_EltwiseInt8;
auto eltwiseInt8Param = new MNN::EltwiseInt8T;
auto input0ScaleParam = new MNN::QuantizedFloatParamT;
auto input1ScaleParam = new MNN::QuantizedFloatParamT;
auto outputScaleParam = new MNN::QuantizedFloatParamT;
input0ScaleParam->tensorScale = inputScale0;
input1ScaleParam->tensorScale = inputScale1;
outputScaleParam->tensorScale = outputInvertScale;
eltwiseInt8Param->inputQuan0 = std::unique_ptr<MNN::QuantizedFloatParamT>(input0ScaleParam);
eltwiseInt8Param->inputQuan1 = std::unique_ptr<MNN::QuantizedFloatParamT>(input1ScaleParam);
eltwiseInt8Param->outputQuan = std::unique_ptr<MNN::QuantizedFloatParamT>(outputScaleParam);
op->main.value = eltwiseInt8Param;
continue;
}
// below is Conv/DepthwiseConv
const auto& inputScale = _scales[tensorsPair->second.first[0]];
const auto& outputScale = _scales[tensorsPair->second.second[0]];
auto param = op->main.AsConvolution2D();
param->common->inputCount = tensorsPair->second.first[0]->channel();
const int channles = param->common->outputCount;
const int weightSize = param->weight.size();
param->symmetricQuan.reset(new MNN::QuantizedFloatParamT);
auto& quantizedParam = param->symmetricQuan;
quantizedParam->scale.resize(channles);
quantizedParam->weight.resize(weightSize);
quantizedParam->bias.resize(channles);
// the two convolution types are quantized with different helpers
if (opType == MNN::OpType_Convolution) {
QuantizeConvPerChannel(param->weight.data(), param->weight.size(), param->bias.data(),
quantizedParam->weight.data(), quantizedParam->bias.data(),
quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
op->type = MNN::OpType_ConvInt8;
} else if (opType == MNN::OpType_ConvolutionDepthwise) {
QuantizeDepthwiseConv(param->weight.data(), param->weight.size(), param->bias.data(),
quantizedParam->weight.data(), quantizedParam->bias.data(),
quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
op->type = MNN::OpType_DepthwiseConvInt8;
}
// replace relu6 with relu
if (param->common->relu6) {
param->common->relu = true;
param->common->relu6 = false;
}
param->weight.clear();
param->bias.clear();
}
}
```
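QuantizeConvPerChannel and QuantizeDepthwiseConv are not shown in this excerpt. As a rough sketch of what max_abs per-channel weight quantization amounts to (a simplified illustration under my own naming; it deliberately omits the bias and the input/output activation scales that the real helpers also receive):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Simplified sketch of symmetric per-channel weight quantization (max_abs method).
// Not MNN's actual QuantizeConvPerChannel: bias quantization and the folding of
// input/output activation scales are intentionally left out.
std::vector<float> quantizeWeightsPerChannel(const std::vector<float>& weights,
                                             int outputChannels,
                                             std::vector<int8_t>& quantized) {
    const int sizePerChannel = static_cast<int>(weights.size()) / outputChannels;
    std::vector<float> scales(outputChannels, 0.0f);
    quantized.resize(weights.size());
    for (int c = 0; c < outputChannels; ++c) {
        const float* w = weights.data() + c * sizePerChannel;
        // max_abs: each output channel (convolution kernel) gets its own scale
        float maxAbs = 0.0f;
        for (int i = 0; i < sizePerChannel; ++i) {
            maxAbs = std::max(maxAbs, std::fabs(w[i]));
        }
        const float scale = maxAbs > 0.0f ? maxAbs / 127.0f : 1.0f;
        scales[c] = scale;
        for (int i = 0; i < sizePerChannel; ++i) {
            int q = static_cast<int>(std::round(w[i] / scale));
            q = std::min(127, std::max(-127, q));
            quantized[c * sizePerChannel + i] = static_cast<int8_t>(q);
        }
    }
    return scales;
}
```

The real helpers additionally take inputScale and outputScale, which are presumably folded into the per-channel scale and the quantized bias stored in symmetricQuan.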
```cpp
void Calibration::_insertDequantize() {
// Search All Int Tensors
std::set<int> int8Tensors; // tensors touched by quantized ops
std::set<int> int8Outputs; // quantized tensors that are also final outputs of the model
for (auto& op : _originaleModel->oplists) {
if (Helper::INT8SUPPORTED_OPS.count(op->type) > 0) {
for (auto index : op->inputIndexes) {
int8Tensors.insert(index);
}
for (auto index : op->outputIndexes) {
int8Tensors.insert(index);
int8Outputs.insert(index);
}
}
}
for (auto& op : _originaleModel->oplists) {
for (auto index : op->inputIndexes) {
auto iter = int8Outputs.find(index);
if (iter != int8Outputs.end()) {
int8Outputs.erase(iter);
}
}
}
// Insert Convert For Not Support Int8 Ops
// handle ops that do not support int8
for (auto iter = _originaleModel->oplists.begin(); iter != _originaleModel->oplists.end();) {
auto op = iter->get();
const auto opType = op->type;
const auto name = op->name;
// check whether is output op
// if Yes, insert dequantization op after this op
if (Helper::INT8SUPPORTED_OPS.find(opType) != Helper::INT8SUPPORTED_OPS.end()) {
// this is quantized op
iter++;
continue;
}
auto& inputIndexes = op->inputIndexes;
const int inputSize = inputIndexes.size();
// insert dequantization op before this op
// if one of this op's inputs is int8, insert a dequantize (Int8ToFloat) op in front of it
for (int i = 0; i < inputSize; ++i) {
const auto curInputIndex = inputIndexes[i];
if (int8Tensors.find(curInputIndex) == int8Tensors.end()) {
continue;
}
auto input = _tensorMap[curInputIndex];
auto inputOpScale = _scales[input];
// construct new op
auto dequantizationOp = new MNN::OpT;
dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
dequantizationOp->name = "___Int8ToFloat___For_" + name + flatbuffers::NumToString(i);
dequantizationOp->type = MNN::OpType_Int8ToFloat;
auto dequantizationParam = new MNN::QuantizedFloatParamT;
dequantizationOp->main.value = dequantizationParam;
dequantizationParam->tensorScale = inputOpScale;
dequantizationOp->inputIndexes.push_back(curInputIndex);
dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
_originaleModel->tensorName.push_back(dequantizationOp->name);
// reset current op's input index at i
inputIndexes[i] = dequantizationOp->outputIndexes[0];
iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(dequantizationOp));
iter++;
}
iter++;
// LOG(INFO) << "insert quantization op after this op if neccessary";
// insert quantization op after this op if neccessary
// if one of this op's outputs feeds an int8 op, insert a quantize (FloatToInt8) op after it
for (int i = 0; i < op->outputIndexes.size(); ++i) {
const auto outputIndex = op->outputIndexes[i];
if (int8Tensors.find(outputIndex) == int8Tensors.end()) {
continue;
}
auto output = _tensorMap[outputIndex];
auto curScale = _scales[output];
// construct one quantization op(FloatToInt8)
auto quantizationOp = new MNN::OpT;
quantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
quantizationOp->name = name + "___FloatToInt8___" + flatbuffers::NumToString(i);
quantizationOp->type = MNN::OpType_FloatToInt8;
auto quantizationParam = new MNN::QuantizedFloatParamT;
quantizationOp->main.value = quantizationParam;
const int channels = curScale.size();
std::vector<float> quantizationScale(channels);
Helper::invertData(quantizationScale.data(), curScale.data(), channels);
quantizationParam->tensorScale = quantizationScale;
quantizationOp->inputIndexes.push_back(_originaleModel->tensorName.size());
quantizationOp->outputIndexes.push_back(outputIndex);
_originaleModel->tensorName.push_back(_originaleModel->tensorName[outputIndex]);
_originaleModel->tensorName[outputIndex] = quantizationOp->name;
op->outputIndexes[i] = quantizationOp->inputIndexes[0];
iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(quantizationOp));
iter++;
}
}
// Insert Turn float Op for output
// insert a dequantize op after each quantized tensor that is a model output
for (auto index : int8Outputs) {
// construct new op
auto dequantizationOp = new MNN::OpT;
dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
dequantizationOp->name = "___Int8ToFloat___For_" + flatbuffers::NumToString(index);
dequantizationOp->type = MNN::OpType_Int8ToFloat;
auto dequantizationParam = new MNN::QuantizedFloatParamT;
dequantizationOp->main.value = dequantizationParam;
dequantizationParam->tensorScale = _scales[_tensorMap[index]];
dequantizationOp->inputIndexes.push_back(index);
dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
auto originTensorName = _originaleModel->tensorName[index];
_originaleModel->tensorName[index] = dequantizationOp->name;
_originaleModel->tensorName.emplace_back(originTensorName);
_originaleModel->oplists.insert(_originaleModel->oplists.end(), std::unique_ptr<MNN::OpT>(dequantizationOp));
}
}
```
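To make the rewrite concrete (a hypothetical example, assuming Softmax is not in Helper::INT8SUPPORTED_OPS): a subgraph ConvInt8 -> Softmax -> ConvInt8 becomes ConvInt8 -> ___Int8ToFloat___ -> Softmax -> ___FloatToInt8___ -> ConvInt8, and every quantized tensor that is also a model output gets an extra ___Int8ToFloat___ op appended, so the converted model still produces float outputs.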