Basic steps of compiling a model with TVM
(1) Import a model from another framework. The main supported frameworks are TensorFlow, PyTorch, and ONNX. The model is converted into Relay, TVM's high-level IR (a minimal import sketch follows this overview).
Relay supports:
- a traditional dataflow-style representation
- a functional-language-style representation
- a mix of both styles
At this step, Relay performs graph-level optimizations.
(2) Lower Relay to the finer-grained Tensor Expression (TE) representation. Relay uses the FuseOps pass to partition the model into small subgraphs; during lowering, schedule primitives such as tiling, vectorization, parallelization, unrolling, and fusion can be applied. TOPI provides pre-defined implementations of common operators.
(3) Search for the best schedule with AutoTVM or AutoScheduler. TVM provides these two auto-tuning modules:
- AutoTVM: a template-based auto-tuning module. TOPI supplies optimized schedule templates for common operators.
- AutoScheduler (a.k.a. Ansor): a template-free auto-tuning module that generates the search space automatically.
(4) Choose the optimal configuration for compiling the model. Auto-tuning produces tuning records in JSON format.
(5) Compile to Tensor IR (TIR, TVM's low-level IR, as opposed to Relay). Supported backends include:
- LLVM, through which TVM can target all hardware LLVM supports, such as x86 and ARM
- specialized compilers, e.g. NVCC, NVIDIA's compiler
- custom targets, implemented via the BYOC (Bring Your Own Codegen) framework
(6) Compile down to machine code. TVM can compile the model into a linkable object module that is executed by a lightweight runtime, which provides APIs in multiple languages. TVM also supports bundling the model and the runtime into a single package.
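For step (1), a minimal sketch of importing a PyTorch model into Relay (this assumes torch and torchvision are installed; the model choice and the input name "data" are illustrative):

import torch
import torchvision
from tvm import relay

pt_model = torchvision.models.resnet18(pretrained=True).eval()
input_shape = (1, 3, 224, 224)
# TorchScript-trace the model, then hand it to the Relay PyTorch frontend
scripted = torch.jit.trace(pt_model, torch.randn(input_shape))
mod, params = relay.frontend.from_pytorch(scripted, [("data", input_shape)])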
Using Relay
Relay is the high-level IR defined in TVM.
Through TVM's Relay module we can convert models from other frameworks and generate code for the corresponding backend.
import numpy as np
from tvm import relay
from tvm.relay import testing
import tvm
from tvm import te
from tvm.contrib import graph_executor
import tvm.testing
######################################################################
# (1) Import the model:
# Here we use the ResNet-18 workload predefined in relay.testing
batch_size = 1
num_class = 1000
image_shape = (3, 224, 224)
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_class)
mod, params = relay.testing.resnet.get_workload(
num_layers=18, batch_size=batch_size, image_shape=image_shape
)
# set show_meta_data=True if you want to show meta data
print(mod.astext(show_meta_data=False))
######################################################################
# (2) Compile
# -----------
# - Optimization level range: 0 to 3.
# - Optimization passes: operator fusion, pre-computation, layout transformation, ...
# Relay performs the graph-level optimizations, TVM the tensor-level ones.
#
opt_level = 3
target = tvm.target.Target(target="llvm", host="llvm") #tvm.target.cuda()
with tvm.transform.PassContext(opt_level=opt_level):
lib = relay.build(mod, target, params=params)
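# As an aside, individual Relay passes can also be applied explicitly; a
# minimal sketch (FoldConstant chosen purely as an illustration):
seq = tvm.transform.Sequential([relay.transform.FoldConstant()])
mod_folded = seq(mod)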
#####################################################################
# (3) Run
dev = tvm.device(target.kind.name, 0) #tvm.cuda()
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
module = graph_executor.GraphModule(lib["default"](dev))
module.set_input("data", data)
module.run()
out = module.get_output(0, tvm.nd.empty(out_shape)).numpy()
print(out.flatten()[0:10])
######################################################################
# (4) Save and load the model
from tvm.contrib import utils
temp = utils.tempdir()
path_lib = temp.relpath("deploy_lib.tar")
lib.export_library(path_lib)
print(temp.listdir())
####################################################
loaded_lib = tvm.runtime.load_module(path_lib)
input_data = tvm.nd.array(data)
module = graph_executor.GraphModule(loaded_lib["default"](dev))
module.run(data=input_data)
out_deploy = module.get_output(0).numpy()
print(out_deploy.flatten()[0:10])
# Compare the results
tvm.testing.assert_allclose(out_deploy, out, atol=1e-5)
Using TOPI
TOPI provides many useful operators together with schedules tuned for specific backends.
"""
The TVM Operator Inventory (TOPI) provides NumPy-style high-level abstraction operators.
"""
from __future__ import absolute_import, print_function
import tvm
import tvm.testing
from tvm import te
from tvm import topi
import numpy as np
######################################################################
# Basic example: row-wise sum (NumPy equivalent: `B = numpy.sum(A, axis=1)`)
#
n = te.var("n")
m = te.var("m")
A = te.placeholder((n, m), name="A")
k = te.reduce_axis((0, m), "k")
B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
s = te.create_schedule(B.op)
######################################################################
# Inspect the generated IR
print(tvm.lower(s, [A], simple_mode=True))
######################################################################
# Use the predefined TOPI operator instead
C = topi.sum(A, axis=1)
ts = te.create_schedule(C.op)
print(tvm.lower(ts, [A], simple_mode=True))
######################################################################
# NumPy-style broadcasting operations
x, y = 100, 10
a = te.placeholder((x, y, y), name="a")
b = te.placeholder((y, y), name="b")
c = a + b # same as topi.broadcast_add
d = a * b # same as topi.broadcast_mul
######################################################################
# Overloaded with the same syntax, TOPI handles broadcasting a primitive (`int`, `float`) against a tensor, e.g. `d - 3.14`.
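# For example (a minimal sketch reusing d from above), subtracting a scalar
# broadcasts it across every element of the tensor:
h = d - 3.14  # equivalent to topi.subtract(d, 3.14)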
######################################################################
# TOPI defines schedules for different platforms, such as CUDA and x86
#
e = topi.elemwise_sum([c, d])
f = e / 2.0
g = topi.sum(f)
with tvm.target.Target(target="llvm", host="llvm"):
# sg = topi.cuda.schedule_reduce(g)
sg = topi.x86.schedule_reduce(g)
print(tvm.lower(sg, [a, b], simple_mode=True))
######################################################################
# As you can see, scheduled stages of computation have been accumulated and we can examine them by
print(sg.stages)
######################################################################
# Compare against the NumPy result
func = tvm.build(sg, [a, b, g], "llvm")
dev = tvm.device("llvm", 0)
a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)
b_np = np.random.uniform(size=(y, y)).astype(b.dtype)
g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)
a_nd = tvm.nd.array(a_np, dev)
b_nd = tvm.nd.array(b_np, dev)
g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev)
func(a_nd, b_nd, g_nd)
tvm.testing.assert_allclose(g_nd.numpy(), g_np, rtol=1e-5)
######################################################################
# TOPI provides common neural-network operators, such as softmax and conv2d
######################################################################
# NOTE: TOPI implementations differ across backends; the target platform and the schedule must match
tarray = te.placeholder((512, 512), name="tarray")
softmax_topi = topi.nn.softmax(tarray)
with tvm.target.Target(target="llvm", host="llvm"):
# sst = topi.cuda.schedule_softmax(softmax_topi)
sst = topi.x86.schedule_softmax(softmax_topi)
print(tvm.lower(sst, [tarray], simple_mode=True))
data = te.placeholder((1, 3, 224, 224))
kernel = te.placeholder((10, 3, 5, 5))
with tvm.target.Target(target="llvm", host="llvm"):
conv = topi.nn.conv2d_nchw(data, kernel, 1, 2, 1)
out = topi.nn.relu(conv)
# sconv = topi.cuda.schedule_conv2d_nchw([out])
sconv = topi.x86.schedule_conv2d_nchw([out])
print(tvm.lower(sconv, [data, kernel], simple_mode=True))
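# As a quick sanity check (a sketch, not part of the original tutorial), the
# scheduled conv2d + relu can be built and run on random inputs. With
# stride=1, padding=2, dilation=1 and a 5x5 kernel on a 224x224 input, the
# output shape is (1, 10, 224, 224), since (224 + 2*2 - 5) + 1 = 224.
func = tvm.build(sconv, [data, kernel, out], "llvm")
dev = tvm.cpu()
a_np = np.random.uniform(size=(1, 3, 224, 224)).astype("float32")
w_np = np.random.uniform(size=(10, 3, 5, 5)).astype("float32")
out_nd = tvm.nd.array(np.zeros((1, 10, 224, 224), dtype="float32"), dev)
func(tvm.nd.array(a_np, dev), tvm.nd.array(w_np, dev), out_nd)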
Compiling, running, and optimizing a model
"""
Compile, run, and optimize a model using TVM's Python API
"""
import onnx
from tvm.contrib.download import download_testdata
from PIL import Image
import numpy as np
import tvm.relay as relay
import tvm
from tvm.contrib import graph_executor
###############################################################################
# Fetch a pre-trained model (in ONNX format) and a test image
model_url = "".join(
[
"https://github.com/onnx/models/raw/",
"master/vision/classification/resnet/model/",
"resnet50-v2-7.onnx",
]
)
model_path = download_testdata(model_url, "resnet50-v2-7.onnx", module="onnx")
onnx_model = onnx.load(model_path)
img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
img_path = download_testdata(img_url, "imagenet_cat.png", module="data")
# Resize it to 224x224
resized_image = Image.open(img_path).resize((224, 224))
img_data = np.asarray(resized_image).astype("float32")
# Our input image is in HWC layout while ONNX expects CHW input, so convert the array
img_data = np.transpose(img_data, (2, 0, 1))
# Normalize according to the ImageNet input specification
imagenet_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
imagenet_stddev = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev
# Add the batch dimension, as we are expecting 4-dimensional input: NCHW.
img_data = np.expand_dims(norm_img_data, axis=0)
###############################################################################
# Compile the model with Relay
target = "llvm"
input_name = "data"
shape_dict = {input_name: img_data.shape}
mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
with tvm.transform.PassContext(opt_level=3):
lib = relay.build(mod, target=target, params=params)
dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))
######################################################################
# Run the model with the TVM runtime
dtype = "float32"
module.set_input(input_name, img_data)
module.run()
output_shape = (1, 1000)
tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy()
#~~~~~~~~~~~
# Collect basic performance numbers
import timeit
timing_number = 10
timing_repeat = 10
unoptimized = (
np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
* 1000
/ timing_number
)
unoptimized = {
"mean": np.mean(unoptimized),
"median": np.median(unoptimized),
"std": np.std(unoptimized),
}
print(unoptimized)
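# GraphModule also ships a built-in benchmarking helper; a one-line sketch
# (assuming a reasonably recent TVM that provides GraphModule.benchmark):
print(module.benchmark(dev, repeat=3, number=10))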
#~~~~~~~~~~
# Post-processing
from scipy.special import softmax
# Download a list of labels
labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt"
labels_path = download_testdata(labels_url, "synset.txt", module="data")
with open(labels_path, "r") as f:
labels = [l.rstrip() for l in f]
# Open the output and read the output tensor
scores = softmax(tvm_output)
scores = np.squeeze(scores)
ranks = np.argsort(scores)[::-1]
for rank in ranks[0:5]:
print("class='%s' with probability=%f" % (labels[rank], scores[rank]))
#~~~~~~~~~~~
# Example output
#
# # class='n02123045 tabby, tabby cat' with probability=0.610553
# # class='n02123159 tiger cat' with probability=0.367179
# # class='n02124075 Egyptian cat' with probability=0.019365
# # class='n02129604 tiger, Panthera tigris' with probability=0.001273
# # class='n04040759 radiator' with probability=0.000261
################################################################################
# Optimize the model with AutoTVM
#
import tvm.auto_scheduler as auto_scheduler
from tvm.autotvm.tuner import XGBTuner
from tvm import autotvm
#~~~~~~~~~~~
# Basic configuration parameters
number = 10  # number of different configurations to test
repeat = 1  # number of measurements taken per configuration
min_repeat_ms = 0  # minimum run time per measurement; matters for accurate GPU tuning, set to 0 for CPU
timeout = 10  # upper limit in seconds on each configuration's test run
# create a TVM runner
runner = autotvm.LocalRunner(
number=number,
repeat=repeat,
timeout=timeout,
min_repeat_ms=min_repeat_ms,
enable_cpu_cache_flush=True,
)
#~~~~~~~~~~~~
# Tuning options
# tuner: xgboost is used as the tuning algorithm here
# trials: for a production model, recommended values are 1500 for CPU and 3000-4000 for GPU.
#   The right number is model- and processor-specific, so it takes some experimentation;
#   it is set to just 10 here as a quick test.
# early_stopping: the minimum number of trials to run before early stopping can apply
# measure_option: where the build and run options are constructed
# tuning_records: output file for the tuning records
tuning_option = {
"tuner": "xgb",
"trials": 10,
"early_stopping": 100,
"measure_option": autotvm.measure_option(
builder=autotvm.LocalBuilder(build_func="default"), runner=runner
),
"tuning_records": "resnet-50-v2-autotuning.json",
}
# begin by extracting the tasks from the onnx model
tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)
# Tune the extracted tasks sequentially.
for i, task in enumerate(tasks):
prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
tuner_obj = XGBTuner(task, loss_type="rank")
tuner_obj.tune(
n_trial=min(tuning_option["trials"], len(task.config_space)),
early_stopping=tuning_option["early_stopping"],
measure_option=tuning_option["measure_option"],
callbacks=[
autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix),
autotvm.callback.log_to_file(tuning_option["tuning_records"]),
],
)
#~~~~~~~
# Example output:
# # [Task 1/24] Current/Best: 10.71/ 21.08 GFLOPS | Progress: (60/1000) | 111.77 s Done.
# # [Task 1/24] Current/Best: 9.32/ 24.18 GFLOPS | Progress: (192/1000) | 365.02 s Done.
# # [Task 2/24] Current/Best: 22.39/ 177.59 GFLOPS | Progress: (960/1000) | 976.17 s Done.
# # [Task 3/24] Current/Best: 32.03/ 153.34 GFLOPS | Progress: (800/1000) | 776.84 s Done.
# ....
# # [Task 24/24] Current/Best: 25.03/ 146.14 GFLOPS | Progress: (1000/1000) | 1112.55 s Done.
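# The JSON log can also be distilled down to only the best record per task;
# a sketch using autotvm.record.pick_best (the output filename is illustrative):
autotvm.record.pick_best(tuning_option["tuning_records"], "resnet-50-v2-best.json")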
################################################################################
# Compile the tuned model
with autotvm.apply_history_best(tuning_option["tuning_records"]):
with tvm.transform.PassContext(opt_level=3, config={}):
lib = relay.build(mod, target=target, params=params)
dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))
#~~~~~~~~~
# Verify the output of the tuned model
dtype = "float32"
module.set_input(input_name, img_data)
module.run()
output_shape = (1, 1000)
tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy()
scores = softmax(tvm_output)
scores = np.squeeze(scores)
ranks = np.argsort(scores)[::-1]
for rank in ranks[0:5]:
print("class='%s' with probability=%f" % (labels[rank], scores[rank]))
#~~~~~~~~~
# Evaluate performance
import timeit
timing_number = 10
timing_repeat = 10
optimized = (
np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
* 1000
/ timing_number
)
optimized = {"mean": np.mean(optimized), "median": np.median(optimized), "std": np.std(optimized)}
print("optimized: %s" % (optimized))
print("unoptimized: %s" % (unoptimized))
Optimizing a model with Ansor
"""
Search for optimized schedules with TVM's auto-scheduling feature (Ansor)
"""
import os
import numpy as np
import tvm
from tvm import te, auto_scheduler
################################################################################
# Define the matrix-multiply computation
# ----------------------------------
# 1. The function is decorated with the `auto_scheduler` decorator
# 2. The function returns a list of input/output tensors, from which Ansor builds the computational graph
@auto_scheduler.register_workload # Note the auto_scheduler decorator
def matmul_add(N, L, M, dtype):
A = te.placeholder((N, L), name="A", dtype=dtype)
B = te.placeholder((L, M), name="B", dtype=dtype)
C = te.placeholder((N, M), name="C", dtype=dtype)
k = te.reduce_axis((0, L), name="k")
matmul = te.compute(
(N, M),
lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
name="matmul",
attrs={"layout_free_placeholders": [B]}, # enable automatic layout transform for tensor B
)
out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")
return [A, B, C, out]
################################################################################
# Create the search task
# ----------------------
# To fully exploit the hardware features, make the target as specific as possible:
# - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2
# - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512
target = tvm.target.Target("llvm")
N = L = M = 1024
task = tvm.auto_scheduler.SearchTask(func=matmul_add, args=(N, L, M, "float32"), target=target)
# Inspect the computational graph
print("Computational DAG:")
print(task.compute_dag)
#~~~~~~~~~~~~~
# Set Ansor's parameters
# ---------------------------------
# `num_measure_trials`: number of measurement trials; convergence typically takes ~1000 trials, set to 10 here for a quick test
log_file = "matmul.json"
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=10,
measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
verbose=2,
)
################################################################################
# Run the automated search
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_file)
################################################################################
# Check the optimized result
# ---------------------------------
# Optimizations applied by the auto-scheduler include multi-level tiling, layout transformation, parallelization, vectorization, unrolling, and operator fusion.
print("Lowered TIR:")
print(tvm.lower(sch, args, simple_mode=True))
#~~~~~~~~~~~
# Check correctness and evaluate performance
func = tvm.build(sch, args, target)
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = np.random.uniform(size=(N, M)).astype(np.float32)
out_np = a_np.dot(b_np) + c_np
dev = tvm.cpu()
a_tvm = tvm.nd.array(a_np, device=dev)
b_tvm = tvm.nd.array(b_np, device=dev)
c_tvm = tvm.nd.array(c_np, device=dev)
out_tvm = tvm.nd.empty(out_np.shape, device=dev)
func(a_tvm, b_tvm, c_tvm, out_tvm)
# Check results
np.testing.assert_allclose(out_np, out_tvm.numpy(), rtol=1e-3)
# Evaluate execution time.
evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
print(
"Execution time of this operator: %.3f ms"
% (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000)
)
# Print the equivalent schedule in Python schedule API form
print("[~~~~~~Equivalent python schedule:~~~~~]")
print(task.print_best(log_file))
################################################################################
# Resuming the search
def resume_search(task, log_file):
print("[~~~~Resume search:~~~~~~~]")
cost_model = auto_scheduler.XGBModel()
cost_model.update_from_file(log_file)
search_policy = auto_scheduler.SketchPolicy(
task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
)
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)]
)
task.tune(tune_option, search_policy=search_policy)
resume_search(task, log_file)