

  1. 导入其他框架的模型
    支持的主要框架:Tensorflow, PyTorch, ONNX

  2. 将该模型转换成Relay(TVM的高层级IR).

  • 传统数据流式的表示
  • 函数式语言风格的表示
  • 两种风格的混合


  1. 将Relay转换成更细粒度的Tensor Expression(TE)
    TE提供了多种调度原语(schedule primitives),如tiling, vectorization, parallelization, unrolling, and fusion

  2. 搜索最佳调度策略(AutoTVM 或 AutoScheduler)

  • AutoTVM: 模板化的自动调优模块。


  • AutoScheduler (a.k.a. Ansor): 无模板的自动调优模块,自动生成搜索空间

  1. 选择模型编译的最优配置。

  2. 编译生成Tensor IR (TIR, TVM的低层级IR,相对于Relay)。

  • LLVM, 通过它可以生成llvm支持的所有硬件如x86, ARM.
  • 特定编译器,如NVCC, NVIDIA的编译器.
  • 通过BYOC(Bring Your Own Codegen)框架实现
  1. 编译生成机器代码。


Relay是TVM中定义的High-level IR。

  1. import numpy as np
  2. from tvm import relay
  3. from tvm.relay import testing
  4. import tvm
  5. from tvm import te
  6. from tvm.contrib import graph_executor
  7. import tvm.testing
  8. ######################################################################
  9. # (1) 导入模型:
  10. # 这里我们使用了预编译的resnet_18
  11. batch_size = 1
  12. num_class = 1000
  13. image_shape = (3, 224, 224)
  14. data_shape = (batch_size,) + image_shape
  15. out_shape = (batch_size, num_class)
  16. mod, params = relay.testing.resnet.get_workload(
  17. num_layers=18, batch_size=batch_size, image_shape=image_shape
  18. )
  19. # set show_meta_data=True if you want to show meta data
  20. print(mod.astext(show_meta_data=False))
  21. ######################################################################
  22. # (2) 编译
  23. # -----------
  24. # - 优化等级范围 (0 to 3).
  25. # - 优化passes: operator fusion, pre-computation, layout transformation...
  26. # Relay进行graph-level优化, TVM进行tensor-level优化
  27. #
  28. opt_level = 3
  29. target = tvm.target.Target("llvm", host="llvm")
  30. with tvm.transform.PassContext(opt_level=opt_level):
  31. lib = relay.build(mod, target, params=params)
  32. #####################################################################
  33. #(3) 运行
  34. dev = tvm.device(target.kind.name, 0) #tvm.cuda()
  35. data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
  36. module = graph_executor.GraphModule(lib["default"](dev))
  37. module.set_input("data", data)
  38. module.run()
  39. out = module.get_output(0, tvm.nd.empty(out_shape)).numpy()
  40. print(out.flatten()[0:10])
  41. ######################################################################
  42. # (4) 保存并加载模型
  43. from tvm.contrib import utils
  44. temp = utils.tempdir()
  45. path_lib = temp.relpath("deploy_lib.tar")
  46. lib.export_library(path_lib)
  47. print(temp.listdir())
  48. ####################################################
  49. loaded_lib = tvm.runtime.load_module(path_lib)
  50. input_data = tvm.nd.array(data)
  51. module = graph_executor.GraphModule(loaded_lib["default"](dev))
  52. module.run(data=input_data)
  53. out_deploy = module.get_output(0).numpy()
  54. print(out_deploy.flatten()[0:10])
  55. # 比较结果
  56. tvm.testing.assert_allclose(out_deploy, out, atol=1e-5)


TOPI 提供了很多实用算子和针对特定后端的调度器。

  1. """
  2. TVM Operator Inventory (TOPI)提供了numpy风格的高级抽象算子.
  3. """
  4. from __future__ import absolute_import, print_function
  5. import tvm
  6. import tvm.testing
  7. from tvm import te
  8. from tvm import topi
  9. import numpy as np
  10. ######################################################################
  11. # 基础实例:按行求和(等价numpy代码:`B = numpy.sum(A, axis=1)`)
  12. #
  13. n = te.var("n")
  14. m = te.var("m")
  15. A = te.placeholder((n, m), name="A")
  16. k = te.reduce_axis((0, m), "k")
  17. B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
  18. s = te.create_schedule(B.op)
  19. ######################################################################
  20. # 查看生成的IR
  21. print(tvm.lower(s, [A], simple_mode=True))
  22. ######################################################################
  23. # 使用预定义topi方法
  24. C = topi.sum(A, axis=1)
  25. ts = te.create_schedule(C.op)
  26. print(tvm.lower(ts, [A], simple_mode=True))
  27. ######################################################################
  28. # Numpy-style 广播操作
  29. x, y = 100, 10
  30. a = te.placeholder((x, y, y), name="a")
  31. b = te.placeholder((y, y), name="b")
  32. c = a + b # same as topi.broadcast_add
  33. d = a * b # same as topi.broadcast_mul
  34. ######################################################################
  35. # Overloaded with the same syntax, TOPI handles broadcasting a primitive (`int`, `float`) to a tensor :code:`d - 3.14`.
  36. ######################################################################
  37. # TOPI定义了针对不同平台的调度器,如CUDA,x86...
  38. #
  39. e = topi.elemwise_sum([c, d])
  40. f = e / 2.0
  41. g = topi.sum(f)
  42. with tvm.target.Target("llvm", host="llvm"):
  43. # sg = topi.cuda.schedule_reduce(g)
  44. sg = topi.x86.schedule_reduce(g)
  45. print(tvm.lower(sg, [a, b], simple_mode=True))
  46. ######################################################################
  47. # As you can see, scheduled stages of computation have been accumulated and we can examine them by
  48. print(sg.stages)
  49. ######################################################################
  50. # 和numpy运算结果做比较
  51. func = tvm.build(sg, [a, b, g], "llvm")
  52. dev = tvm.device("llvm", 0)
  53. a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)
  54. b_np = np.random.uniform(size=(y, y)).astype(b.dtype)
  55. g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)
  56. a_nd = tvm.nd.array(a_np, dev)
  57. b_nd = tvm.nd.array(b_np, dev)
  58. g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev)
  59. func(a_nd, b_nd, g_nd)
  60. tvm.testing.assert_allclose(g_nd.numpy(), g_np, rtol=1e-5)
  61. ######################################################################
  62. # TOPI 提供了常用的神经网络算子,如softmax,conv
  63. ######################################################################
  64. # NOTE: TOPI的实现因后端而异,目标平台和调度器必须一致
  65. tarray = te.placeholder((512, 512), name="tarray")
  66. softmax_topi = topi.nn.softmax(tarray)
  67. with tvm.target.Target("llvm", host="llvm"):
  68. # sst = topi.cuda.schedule_softmax(softmax_topi)
  69. sst = topi.x86.schedule_softmax(softmax_topi)
  70. print(tvm.lower(sst, [tarray], simple_mode=True))
  71. data = te.placeholder((1, 3, 224, 224))
  72. kernel = te.placeholder((10, 3, 5, 5))
  73. with tvm.target.Target("llvm", host="llvm"):
  74. conv = topi.nn.conv2d_nchw(data, kernel, 1, 2, 1)
  75. out = topi.nn.relu(conv)
  76. # sconv = topi.cuda.schedule_conv2d_nchw([out])
  77. sconv = topi.x86.schedule_conv2d_nchw([out])
  78. print(tvm.lower(sconv, [data, kernel], simple_mode=True))


  1. """
  2. 使用TVM的python API来进行模型的编译、运行和优化
  3. """
  4. import onnx
  5. from tvm.contrib.download import download_testdata
  6. from PIL import Image
  7. import numpy as np
  8. import tvm.relay as relay
  9. import tvm
  10. from tvm.contrib import graph_executor
  11. ###############################################################################
  12. # 获取一个预训练的模型(使用了一个ONNX格式的模型)和测试图片
  13. model_url = "".join(
  14. [
  15. "https://github.com/onnx/models/raw/",
  16. "master/vision/classification/resnet/model/",
  17. "resnet50-v2-7.onnx",
  18. ]
  19. )
  20. model_path = download_testdata(model_url, "resnet50-v2-7.onnx", module="onnx")
  21. onnx_model = onnx.load(model_path)
  22. img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
  23. img_path = download_testdata(img_url, "imagenet_cat.png", module="data")
  24. # Resize it to 224x224
  25. resized_image = Image.open(img_path).resize((224, 224))
  26. img_data = np.asarray(resized_image).astype("float32")
  27. # Our input image is in HWC layout while ONNX expects CHW input, so convert the array
  28. img_data = np.transpose(img_data, (2, 0, 1))
  29. # Normalize according to the ImageNet input specification
  30. imagenet_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
  31. imagenet_stddev = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
  32. norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev
  33. # Add the batch dimension, as we are expecting 4-dimensional input: NCHW.
  34. img_data = np.expand_dims(norm_img_data, axis=0)
  35. ###############################################################################
  36. # 使用Relay来编译模型
  37. target = "llvm"
  38. input_name = "data"
  39. shape_dict = {input_name: img_data.shape}
  40. mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
  41. with tvm.transform.PassContext(opt_level=3):
  42. lib = relay.build(mod, target=target, params=params)
  43. dev = tvm.device(str(target), 0)
  44. module = graph_executor.GraphModule(lib["default"](dev))
  45. ######################################################################
  46. # 使用TVM Runtime运行模型
  47. dtype = "float32"
  48. module.set_input(input_name, img_data)
  49. module.run()
  50. output_shape = (1, 1000)
  51. tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy()
  52. #~~~~~~~~~~~
  53. # 评估基本的性能数据
  54. import timeit
  55. timing_number = 10
  56. timing_repeat = 10
  57. unoptimized = (
  58. np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
  59. * 1000
  60. / timing_number
  61. )
  62. unoptimized = {
  63. "mean": np.mean(unoptimized),
  64. "median": np.median(unoptimized),
  65. "std": np.std(unoptimized),
  66. }
  67. print(unoptimized)
  68. #~~~~~~~~~~
  69. # 后处理
  70. from scipy.special import softmax
  71. # Download a list of labels
  72. labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt"
  73. labels_path = download_testdata(labels_url, "synset.txt", module="data")
  74. with open(labels_path, "r") as f:
  75. labels = [l.rstrip() for l in f]
  76. # Open the output and read the output tensor
  77. scores = softmax(tvm_output)
  78. scores = np.squeeze(scores)
  79. ranks = np.argsort(scores)[::-1]
  80. for rank in ranks[0:5]:
  81. print("class='%s' with probability=%f" % (labels[rank], scores[rank]))
  82. #~~~~~~~~~~~
  83. # 输出结果
  84. #
  85. # # class='n02123045 tabby, tabby cat' with probability=0.610553
  86. # # class='n02123159 tiger cat' with probability=0.367179
  87. # # class='n02124075 Egyptian cat' with probability=0.019365
  88. # # class='n02129604 tiger, Panthera tigris' with probability=0.001273
  89. # # class='n04040759 radiator' with probability=0.000261
  90. ################################################################################
  91. # 使用autotvm来优化模型
  92. #
  93. import tvm.auto_scheduler as auto_scheduler
  94. from tvm.autotvm.tuner import XGBTuner
  95. from tvm import autotvm
  96. #~~~~~~~~~~~
  97. # 基本配置参数
  98. number = 10 # 尝试的不同优化配置的数目
  99. repeat = 1 # 每个配置重复测量的次数
  100. min_repeat_ms = 0 # 最小运行时间,影响GPU精度调优,CPU设置为0
  101. timeout = 10 # 超时时间
  102. # create a TVM runner
  103. runner = autotvm.LocalRunner(
  104. number=number,
  105. repeat=repeat,
  106. timeout=timeout,
  107. min_repeat_ms=min_repeat_ms,
  108. enable_cpu_cache_flush=True,
  109. )
  110. #~~~~~~~~~~~~
  111. # 优化选项
  112. # tuner: 这里使用xgboost作为优化算法
  113. # trials:对于产品级的模型,trials推荐值为CPU 1500, GPU 3000-4000. 这个是模型和处理器相关的。
  114. # 需要多花时间实验。这里作为简单测试,仅设置为10
  115. # early_stopping:尝试最小次数
  116. # measure_option: 构建运行选项
  117. # tuning_records: 输出的优化记录文件
  118. tuning_option = {
  119. "tuner": "xgb",
  120. "trials": 10,
  121. "early_stopping": 100,
  122. "measure_option": autotvm.measure_option(
  123. builder=autotvm.LocalBuilder(build_func="default"), runner=runner
  124. ),
  125. "tuning_records": "resnet-50-v2-autotuning.json",
  126. }
  127. # begin by extracting the tasks from the onnx model
  128. tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)
  129. # Tune the extracted tasks sequentially.
  130. for i, task in enumerate(tasks):
  131. prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
  132. tuner_obj = XGBTuner(task, loss_type="rank")
  133. tuner_obj.tune(
  134. n_trial=min(tuning_option["trials"], len(task.config_space)),
  135. early_stopping=tuning_option["early_stopping"],
  136. measure_option=tuning_option["measure_option"],
  137. callbacks=[
  138. autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix),
  139. autotvm.callback.log_to_file(tuning_option["tuning_records"]),
  140. ],
  141. )
  142. #~~~~~~~
  143. # 输出实例:
  144. # # [Task 1/24] Current/Best: 10.71/ 21.08 GFLOPS | Progress: (60/1000) | 111.77 s Done.
  145. # # [Task 1/24] Current/Best: 9.32/ 24.18 GFLOPS | Progress: (192/1000) | 365.02 s Done.
  146. # # [Task 2/24] Current/Best: 22.39/ 177.59 GFLOPS | Progress: (960/1000) | 976.17 s Done.
  147. # # [Task 3/24] Current/Best: 32.03/ 153.34 GFLOPS | Progress: (800/1000) | 776.84 s Done.
  148. # ....
  149. # # [Task 24/24] Current/Best: 25.03/ 146.14 GFLOPS | Progress: (1000/1000) | 1112.55 s Done.
  150. ################################################################################
  151. # 编译优化后的模型
  152. with autotvm.apply_history_best(tuning_option["tuning_records"]):
  153. with tvm.transform.PassContext(opt_level=3, config={}):
  154. lib = relay.build(mod, target=target, params=params)
  155. dev = tvm.device(str(target), 0)
  156. module = graph_executor.GraphModule(lib["default"](dev))
  157. #~~~~~~~~~
  158. # 校验优化模型的结果
  159. dtype = "float32"
  160. module.set_input(input_name, img_data)
  161. module.run()
  162. output_shape = (1, 1000)
  163. tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy()
  164. scores = softmax(tvm_output)
  165. scores = np.squeeze(scores)
  166. ranks = np.argsort(scores)[::-1]
  167. for rank in ranks[0:5]:
  168. print("class='%s' with probability=%f" % (labels[rank], scores[rank]))
  169. #~~~~~~~~~
  170. # 评估性能
  171. import timeit
  172. timing_number = 10
  173. timing_repeat = 10
  174. optimized = (
  175. np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
  176. * 1000
  177. / timing_number
  178. )
  179. optimized = {"mean": np.mean(optimized), "median": np.median(optimized), "std": np.std(optimized)}
  180. print("optimized: %s" % (optimized))
  181. print("unoptimized: %s" % (unoptimized))


  1. """
  2. 使用TVM的Auto Scheduling feature(Ansor)搜索优化解
  3. """
  4. import os
  5. import numpy as np
  6. import tvm
  7. from tvm import te, auto_scheduler
  8. ################################################################################
  9. # 定义矩阵乘计算
  10. # ----------------------------------
  11. # 1 函数使用 `auto_scheduler.register_workload` 装饰器
  12. # 2 函数返回input/output tensors列表. Ansor据此构建计算图。
  13. @auto_scheduler.register_workload # Note the auto_scheduler decorator
  14. def matmul_add(N, L, M, dtype):
  15. A = te.placeholder((N, L), name="A", dtype=dtype)
  16. B = te.placeholder((L, M), name="B", dtype=dtype)
  17. C = te.placeholder((N, M), name="C", dtype=dtype)
  18. k = te.reduce_axis((0, L), name="k")
  19. matmul = te.compute(
  20. (N, M),
  21. lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
  22. name="matmul",
  23. attrs={"layout_free_placeholders": [B]}, # enable automatic layout transform for tensor B
  24. )
  25. out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")
  26. return [A, B, C, out]
  27. ################################################################################
  28. # 创建搜索任务
  29. # ----------------------
  30. # 为了充分利用硬件特性,Target越具体越好
  31. # - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2
  32. # - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512
  33. target = tvm.target.Target("llvm")
  34. N = L = M = 1024
  35. task = tvm.auto_scheduler.SearchTask(func=matmul_add, args=(N, L, M, "float32"), target=target)
  36. # Inspect the computational graph
  37. print("Computational DAG:")
  38. print(task.compute_dag)
  39. #~~~~~~~~~~~~~
  40. # 设置Ansor的参数
  41. # ---------------------------------
  42. # `num_measure_trials`: 尝试次数,通常可能1000次才能收敛,为测试方便,设为10
  43. log_file = "matmul.json"
  44. tune_option = auto_scheduler.TuningOptions(
  45. num_measure_trials=10,
  46. measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
  47. verbose=2,
  48. )
  49. ################################################################################
  50. # 执行自动搜索任务
  51. task.tune(tune_option)
  52. # Apply the best schedule
  53. sch, args = task.apply_best(log_file)
  54. ################################################################################
  55. # 校验优化后的结果
  56. # ---------------------------------
  57. # auto-scheduler 执行的优化包括 multi-level tiling, layout transformation, parallelization, vectorization, unrolling, and operator fusion.
  58. print("Lowered TIR:")
  59. print(tvm.lower(sch, args, simple_mode=True))
  60. #~~~~~~~~~~~
  61. # 检验正确性,评估性能
  62. func = tvm.build(sch, args, target)
  63. a_np = np.random.uniform(size=(N, L)).astype(np.float32)
  64. b_np = np.random.uniform(size=(L, M)).astype(np.float32)
  65. c_np = np.random.uniform(size=(N, M)).astype(np.float32)
  66. out_np = a_np.dot(b_np) + c_np
  67. dev = tvm.cpu()
  68. a_tvm = tvm.nd.array(a_np, device=dev)
  69. b_tvm = tvm.nd.array(b_np, device=dev)
  70. c_tvm = tvm.nd.array(c_np, device=dev)
  71. out_tvm = tvm.nd.empty(out_np.shape, device=dev)
  72. func(a_tvm, b_tvm, c_tvm, out_tvm)
  73. # Check results
  74. np.testing.assert_allclose(out_np, out_tvm.numpy(), rtol=1e-3)
  75. # Evaluate execution time.
  76. evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
  77. print(
  78. "Execution time of this operator: %.3f ms"
  79. % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000)
  80. )
  81. # 输出等价的Python schedule API结果
  82. print("[~~~~~~Equivalent python schedule:~~~~~]")
  83. print(task.print_best(log_file))
  84. ################################################################################
  85. # 恢复搜索
  86. def resume_search(task, log_file):
  87. print("[~~~~Resume search:~~~~~~~]")
  88. cost_model = auto_scheduler.XGBModel()
  89. cost_model.update_from_file(log_file)
  90. search_policy = auto_scheduler.SketchPolicy(
  91. task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
  92. )
  93. tune_option = auto_scheduler.TuningOptions(
  94. num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)]
  95. )
  96. task.tune(tune_option, search_policy=search_policy)
  97. resume_search(task, log_file)