The Basic Steps of Compiling a Model with TVM


  1. Import a model from another framework.
    The main supported frameworks: TensorFlow, PyTorch, ONNX.

  2. Translate the model into Relay (TVM's high-level IR).
    Relay supports:

  • a traditional dataflow-style representation
  • a functional-language-style representation
  • a mix of the two styles

    At this step, Relay applies graph-level optimizations.

  3. Lower Relay to the finer-grained Tensor Expression (TE) representation.
    Relay uses the FuseOps pass to partition the model into small subgraphs. At this level, schedule
    primitives such as tiling, vectorization, parallelization, unrolling, and fusion can be applied
    (see the sketch after this list).
    TOPI provides predefined implementations of common operators.

  4. Search for the best schedule (AutoTVM or AutoScheduler).
    TVM provides the following two auto-tuning modules.

  • AutoTVM: a template-based auto-tuning module.

    TOPI provides pre-tuned schedule templates for common operators.

  • AutoScheduler (a.k.a. Ansor): a template-free auto-tuning module that generates the search space automatically.

  5. Pick the optimal configuration for compiling the model.
    Auto-tuning writes its optimization records to a JSON file.

  6. Lower to Tensor IR (TIR, TVM's low-level IR, as opposed to Relay).
    Supported backends include:

  • LLVM, through which TVM can target all LLVM-supported hardware, such as x86 and ARM.
  • Specialized compilers, such as NVCC, NVIDIA's compiler.
  • Custom backends, implemented through the BYOC (Bring Your Own Codegen) framework.

  7. Compile to machine code.
    TVM compiles the model into a linkable object module that can be run by a lightweight TVM runtime,
    with bindings available in several languages. TVM also supports bundling the model and the runtime
    into a single package.
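
The schedule primitives named in step 3 can also be applied by hand on a TE schedule. Below is a minimal
sketch on a vector-add computation; the split factor of 4 is an arbitrary illustrative choice, not a tuned
value:

import tvm
from tvm import te

# a simple elementwise computation in TE
n = te.var("n")
A = te.placeholder((n,), name="A")
B = te.placeholder((n,), name="B")
C = te.compute((n,), lambda i: A[i] + B[i], name="C")

# split the loop, then parallelize the outer part and vectorize the inner part
s = te.create_schedule(C.op)
outer, inner = s[C].split(C.op.axis[0], factor=4)
s[C].parallel(outer)
s[C].vectorize(inner)
print(tvm.lower(s, [A, B, C], simple_mode=True))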

Using Relay

Relay is the high-level IR defined in TVM.
Through TVM's Relay module, we can convert models from other frameworks and generate code for the corresponding backend.
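
Before the end-to-end example, here is a minimal sketch of constructing Relay IR by hand, just to show what
the high-level IR looks like (this tiny add function is purely illustrative):

import tvm
from tvm import relay

# build a tiny Relay function: y = x + 1
x = relay.var("x", shape=(2, 3), dtype="float32")
y = relay.add(x, relay.const(1.0, "float32"))
func = relay.Function([x], y)
mod = tvm.IRModule.from_expr(func)
print(mod)  # prints the textual form of the high-level IR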

import numpy as np
from tvm import relay
from tvm.relay import testing
import tvm
from tvm import te
from tvm.contrib import graph_executor
import tvm.testing

######################################################################
# (1) Import a model:
# here we use the predefined resnet-18 workload
batch_size = 1
num_class = 1000
image_shape = (3, 224, 224)
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_class)
mod, params = relay.testing.resnet.get_workload(
    num_layers=18, batch_size=batch_size, image_shape=image_shape
)
# set show_meta_data=True if you want to show meta data
print(mod.astext(show_meta_data=False))

######################################################################
# (2) Compile
# -----------
# - optimization level ranges from 0 to 3
# - optimization passes: operator fusion, pre-computation, layout transformation...
# Relay performs graph-level optimizations, TVM performs tensor-level optimizations
#
opt_level = 3
target = tvm.target.Target(target="llvm", host="llvm")  # tvm.target.cuda()
with tvm.transform.PassContext(opt_level=opt_level):
    lib = relay.build(mod, target, params=params)

#####################################################################
# (3) Run
dev = tvm.device(target.kind.name, 0)  # tvm.cuda()
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
module = graph_executor.GraphModule(lib["default"](dev))
module.set_input("data", data)
module.run()
out = module.get_output(0, tvm.nd.empty(out_shape)).numpy()
print(out.flatten()[0:10])

######################################################################
# (4) Save and load the compiled module
from tvm.contrib import utils

temp = utils.tempdir()
path_lib = temp.relpath("deploy_lib.tar")
lib.export_library(path_lib)
print(temp.listdir())

####################################################
loaded_lib = tvm.runtime.load_module(path_lib)
input_data = tvm.nd.array(data)
module = graph_executor.GraphModule(loaded_lib["default"](dev))
module.run(data=input_data)
out_deploy = module.get_output(0).numpy()
print(out_deploy.flatten()[0:10])

# compare the results
tvm.testing.assert_allclose(out_deploy, out, atol=1e-5)

Using TOPI

TOPI provides many useful operators, along with schedules tuned for specific backends.

  1. """
  2. TVM Operator Inventory (TOPI)提供了numpy风格的高级抽象算子.
  3. """
  4. from __future__ import absolute_import, print_function
  5. import tvm
  6. import tvm.testing
  7. from tvm import te
  8. from tvm import topi
  9. import numpy as np
  10. ######################################################################
  11. # 基础实例:按行求和(等价numpy代码:`B = numpy.sum(A, axis=1)`)
  12. #
  13. n = te.var("n")
  14. m = te.var("m")
  15. A = te.placeholder((n, m), name="A")
  16. k = te.reduce_axis((0, m), "k")
  17. B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
  18. s = te.create_schedule(B.op)
  19. ######################################################################
  20. # 查看生成的IR
  21. print(tvm.lower(s, [A], simple_mode=True))
  22. ######################################################################
  23. # 使用预定义topi方法
  24. C = topi.sum(A, axis=1)
  25. ts = te.create_schedule(C.op)
  26. print(tvm.lower(ts, [A], simple_mode=True))
  27. ######################################################################
  28. # Numpy-style 广播操作
  29. x, y = 100, 10
  30. a = te.placeholder((x, y, y), name="a")
  31. b = te.placeholder((y, y), name="b")
  32. c = a + b # same as topi.broadcast_add
  33. d = a * b # same as topi.broadcast_mul
  34. ######################################################################
  35. # Overloaded with the same syntax, TOPI handles broadcasting a primitive (`int`, `float`) to a tensor :code:`d - 3.14`.
  36. ######################################################################
  37. # TOPI定义了针对不同平台的调度器,如CUDA,x86...
  38. #
  39. e = topi.elemwise_sum([c, d])
  40. f = e / 2.0
  41. g = topi.sum(f)
  42. with tvm.target.Target(target="llvm", host="llvm"):
  43. # sg = topi.cuda.schedule_reduce(g)
  44. sg = topi.x86.schedule_reduce(g)
  45. print(tvm.lower(sg, [a, b], simple_mode=True))
  46. ######################################################################
  47. # As you can see, scheduled stages of computation have been accumulated and we can examine them by
  48. print(sg.stages)
  49. ######################################################################
  50. # 和numpy运算结果做比较
  51. func = tvm.build(sg, [a, b, g], "llvm")
  52. dev = tvm.device("llvm", 0)
  53. a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)
  54. b_np = np.random.uniform(size=(y, y)).astype(b.dtype)
  55. g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)
  56. a_nd = tvm.nd.array(a_np, dev)
  57. b_nd = tvm.nd.array(b_np, dev)
  58. g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev)
  59. func(a_nd, b_nd, g_nd)
  60. tvm.testing.assert_allclose(g_nd.numpy(), g_np, rtol=1e-5)
  61. ######################################################################
  62. # TOPI 提供了常用的神经网络算子,如softmax,conv
  63. ######################################################################
  64. # NOTE: TOPI的实现因后端而异,目标平台和调度器必须一致
  65. tarray = te.placeholder((512, 512), name="tarray")
  66. softmax_topi = topi.nn.softmax(tarray)
  67. with tvm.target.Target(target="llvm", host="llvm"):
  68. # sst = topi.cuda.schedule_softmax(softmax_topi)
  69. sst = topi.x86.schedule_softmax(softmax_topi)
  70. print(tvm.lower(sst, [tarray], simple_mode=True))
  71. data = te.placeholder((1, 3, 224, 224))
  72. kernel = te.placeholder((10, 3, 5, 5))
  73. with tvm.target.Target(target="llvm", host="llvm"):
  74. conv = topi.nn.conv2d_nchw(data, kernel, 1, 2, 1)
  75. out = topi.nn.relu(conv)
  76. # sconv = topi.cuda.schedule_conv2d_nchw([out])
  77. sconv = topi.x86.schedule_conv2d_nchw([out])
  78. print(tvm.lower(sconv, [data, kernel], simple_mode=True))

Compiling, Running, and Optimizing a Model

  1. """
  2. 使用TVM的python API来进行模型的编译、运行和优化
  3. """
  4. import onnx
  5. from tvm.contrib.download import download_testdata
  6. from PIL import Image
  7. import numpy as np
  8. import tvm.relay as relay
  9. import tvm
  10. from tvm.contrib import graph_executor
  11. ###############################################################################
  12. # 获取一个预训练的模型(使用了一个ONNX格式的模型)和测试图片
  13. model_url = "".join(
  14. [
  15. "https://github.com/onnx/models/raw/",
  16. "master/vision/classification/resnet/model/",
  17. "resnet50-v2-7.onnx",
  18. ]
  19. )
  20. model_path = download_testdata(model_url, "resnet50-v2-7.onnx", module="onnx")
  21. onnx_model = onnx.load(model_path)
  22. img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
  23. img_path = download_testdata(img_url, "imagenet_cat.png", module="data")
  24. # Resize it to 224x224
  25. resized_image = Image.open(img_path).resize((224, 224))
  26. img_data = np.asarray(resized_image).astype("float32")
  27. # Our input image is in HWC layout while ONNX expects CHW input, so convert the array
  28. img_data = np.transpose(img_data, (2, 0, 1))
  29. # Normalize according to the ImageNet input specification
  30. imagenet_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
  31. imagenet_stddev = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
  32. norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev
  33. # Add the batch dimension, as we are expecting 4-dimensional input: NCHW.
  34. img_data = np.expand_dims(norm_img_data, axis=0)
  35. ###############################################################################
  36. # 使用Relay来编译模型
  37. target = "llvm"
  38. input_name = "data"
  39. shape_dict = {input_name: img_data.shape}
  40. mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
  41. with tvm.transform.PassContext(opt_level=3):
  42. lib = relay.build(mod, target=target, params=params)
  43. dev = tvm.device(str(target), 0)
  44. module = graph_executor.GraphModule(lib["default"](dev))
  45. ######################################################################
  46. # 使用TVM Runtime运行模型
  47. dtype = "float32"
  48. module.set_input(input_name, img_data)
  49. module.run()
  50. output_shape = (1, 1000)
  51. tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy()
  52. #~~~~~~~~~~~
  53. # 评估基本的性能数据
  54. import timeit
  55. timing_number = 10
  56. timing_repeat = 10
  57. unoptimized = (
  58. np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
  59. * 1000
  60. / timing_number
  61. )
  62. unoptimized = {
  63. "mean": np.mean(unoptimized),
  64. "median": np.median(unoptimized),
  65. "std": np.std(unoptimized),
  66. }
  67. print(unoptimized)
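#~~~~~~~~~~~
# NOTE: an alternative to timeit is the built-in benchmarking helper on the
# graph executor; a minimal sketch, assuming your TVM version provides
# GraphModule.benchmark:
# print(module.benchmark(dev, number=timing_number, repeat=timing_repeat))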
#~~~~~~~~~~
# post-processing
from scipy.special import softmax

# Download a list of labels
labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt"
labels_path = download_testdata(labels_url, "synset.txt", module="data")
with open(labels_path, "r") as f:
    labels = [l.rstrip() for l in f]

# Open the output and read the output tensor
scores = softmax(tvm_output)
scores = np.squeeze(scores)
ranks = np.argsort(scores)[::-1]
for rank in ranks[0:5]:
    print("class='%s' with probability=%f" % (labels[rank], scores[rank]))

#~~~~~~~~~~~
# sample output:
#
# # class='n02123045 tabby, tabby cat' with probability=0.610553
# # class='n02123159 tiger cat' with probability=0.367179
# # class='n02124075 Egyptian cat' with probability=0.019365
# # class='n02129604 tiger, Panthera tigris' with probability=0.001273
# # class='n04040759 radiator' with probability=0.000261

################################################################################
# optimize the model with autotvm
#
import tvm.auto_scheduler as auto_scheduler
from tvm.autotvm.tuner import XGBTuner
from tvm import autotvm

#~~~~~~~~~~~
# basic configuration parameters
number = 10  # number of different optimization configurations to try
repeat = 1  # how many measurements to take of each configuration
min_repeat_ms = 0  # minimum run time in ms; affects the accuracy of GPU tuning, set to 0 for CPU tuning
timeout = 10  # timeout in seconds
# create a TVM runner
runner = autotvm.LocalRunner(
    number=number,
    repeat=repeat,
    timeout=timeout,
    min_repeat_ms=min_repeat_ms,
    enable_cpu_cache_flush=True,
)

#~~~~~~~~~~~~
# tuning options
# tuner: use xgboost as the search algorithm
# trials: for a production-level model, the recommended value is 1500 for CPU and
#   3000-4000 for GPU; it depends on the model and the processor, so expect to
#   spend some time experimenting. For this quick test it is set to just 10.
# early_stopping: the minimum number of trials before early stopping can trigger
# measure_option: how each trial is built and run
# tuning_records: file the tuning records are written to
tuning_option = {
    "tuner": "xgb",
    "trials": 10,
    "early_stopping": 100,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func="default"), runner=runner
    ),
    "tuning_records": "resnet-50-v2-autotuning.json",
}

# begin by extracting the tasks from the onnx model
tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)

# Tune the extracted tasks sequentially.
for i, task in enumerate(tasks):
    prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
    tuner_obj = XGBTuner(task, loss_type="rank")
    tuner_obj.tune(
        n_trial=min(tuning_option["trials"], len(task.config_space)),
        early_stopping=tuning_option["early_stopping"],
        measure_option=tuning_option["measure_option"],
        callbacks=[
            autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix),
            autotvm.callback.log_to_file(tuning_option["tuning_records"]),
        ],
    )

#~~~~~~~
# sample output:
# # [Task  1/24]  Current/Best:   10.71/  21.08 GFLOPS | Progress: (60/1000) | 111.77 s Done.
# # [Task  1/24]  Current/Best:    9.32/  24.18 GFLOPS | Progress: (192/1000) | 365.02 s Done.
# # [Task  2/24]  Current/Best:   22.39/ 177.59 GFLOPS | Progress: (960/1000) | 976.17 s Done.
# # [Task  3/24]  Current/Best:   32.03/ 153.34 GFLOPS | Progress: (800/1000) | 776.84 s Done.
# ....
# # [Task 24/24]  Current/Best:   25.03/ 146.14 GFLOPS | Progress: (1000/1000) | 1112.55 s Done.

################################################################################
# compile the tuned model
with autotvm.apply_history_best(tuning_option["tuning_records"]):
    with tvm.transform.PassContext(opt_level=3, config={}):
        lib = relay.build(mod, target=target, params=params)
dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))

#~~~~~~~~~
# verify that the optimized model still produces the same predictions
dtype = "float32"
module.set_input(input_name, img_data)
module.run()
output_shape = (1, 1000)
tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy()
scores = softmax(tvm_output)
scores = np.squeeze(scores)
ranks = np.argsort(scores)[::-1]
for rank in ranks[0:5]:
    print("class='%s' with probability=%f" % (labels[rank], scores[rank]))

#~~~~~~~~~
# evaluate performance
import timeit

timing_number = 10
timing_repeat = 10
optimized = (
    np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
    * 1000
    / timing_number
)
optimized = {"mean": np.mean(optimized), "median": np.median(optimized), "std": np.std(optimized)}
print("optimized: %s" % (optimized))
print("unoptimized: %s" % (unoptimized))

Optimizing a Model with Ansor

  1. """
  2. 使用TVM的Auto Scheduling feature(Ansor)搜索优化解
  3. """
  4. import os
  5. import numpy as np
  6. import tvm
  7. from tvm import te, auto_scheduler
  8. ################################################################################
  9. # 定义矩阵乘计算
  10. # ----------------------------------
  11. # 1 函数使用 `auto_sceduler` 装饰器
  12. # 2 函数返回input/output tensors列表. Ansor据此构建计算图。
  13. @auto_scheduler.register_workload # Note the auto_scheduler decorator
  14. def matmul_add(N, L, M, dtype):
  15. A = te.placeholder((N, L), name="A", dtype=dtype)
  16. B = te.placeholder((L, M), name="B", dtype=dtype)
  17. C = te.placeholder((N, M), name="C", dtype=dtype)
  18. k = te.reduce_axis((0, L), name="k")
  19. matmul = te.compute(
  20. (N, M),
  21. lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
  22. name="matmul",
  23. attrs={"layout_free_placeholders": [B]}, # enable automatic layout transform for tensor B
  24. )
  25. out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")
  26. return [A, B, C, out]
  27. ################################################################################
  28. # 创建搜索任务
  29. # ----------------------
  30. # 为了充分利用硬件特性,Target越具体越好
  31. # - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2
  32. # - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512
  33. target = tvm.target.Target("llvm")
  34. N = L = M = 1024
  35. task = tvm.auto_scheduler.SearchTask(func=matmul_add, args=(N, L, M, "float32"), target=target)
  36. # Inspect the computational graph
  37. print("Computational DAG:")
  38. print(task.compute_dag)
  39. #~~~~~~~~~~~~~
  40. # 设置Ansor的参数
  41. # ---------------------------------
  42. # `num_measure_trials`: 尝试次数,通常可能1000次才能收敛,为测试方便,设为10
  43. log_file = "matmul.json"
  44. tune_option = auto_scheduler.TuningOptions(
  45. num_measure_trials=10,
  46. measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
  47. verbose=2,
  48. )
  49. ################################################################################
  50. # 执行自动搜索任务
  51. task.tune(tune_option)
  52. # Apply the best schedule
  53. sch, args = task.apply_best(log_file)
  54. ################################################################################
  55. # 校验优化后的结果
  56. # ---------------------------------
  57. # auto-scheduler 执行的优化包括 multi-level tiling, layout transformation, parallelization, vectorization, unrolling, and operator fusion.
  58. print("Lowered TIR:")
  59. print(tvm.lower(sch, args, simple_mode=True))
  60. #~~~~~~~~~~~
  61. # 检验正确性,评估性能
  62. func = tvm.build(sch, args, target)
  63. a_np = np.random.uniform(size=(N, L)).astype(np.float32)
  64. b_np = np.random.uniform(size=(L, M)).astype(np.float32)
  65. c_np = np.random.uniform(size=(N, M)).astype(np.float32)
  66. out_np = a_np.dot(b_np) + c_np
  67. dev = tvm.cpu()
  68. a_tvm = tvm.nd.array(a_np, device=dev)
  69. b_tvm = tvm.nd.array(b_np, device=dev)
  70. c_tvm = tvm.nd.array(c_np, device=dev)
  71. out_tvm = tvm.nd.empty(out_np.shape, device=dev)
  72. func(a_tvm, b_tvm, c_tvm, out_tvm)
  73. # Check results
  74. np.testing.assert_allclose(out_np, out_tvm.numpy(), rtol=1e-3)
  75. # Evaluate execution time.
  76. evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
  77. print(
  78. "Execution time of this operator: %.3f ms"
  79. % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000)
  80. )
  81. # 输出等价的Python schedule API结果
  82. print("[~~~~~~Equivalent python schedule:~~~~~]")
  83. print(task.print_best(log_file))
  84. ################################################################################
  85. # 恢复搜索
  86. def resume_search(task, log_file):
  87. print("[~~~~Resume search:~~~~~~~]")
  88. cost_model = auto_scheduler.XGBModel()
  89. cost_model.update_from_file(log_file)
  90. search_policy = auto_scheduler.SketchPolicy(
  91. task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
  92. )
  93. tune_option = auto_scheduler.TuningOptions(
  94. num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)]
  95. )
  96. task.tune(tune_option, search_policy=search_policy)
  97. resume_search(task, log_file)
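
Resuming simply appends more measurement records to the same log file, so once the extra trials finish, the
best schedule can be re-applied and the kernel rebuilt exactly as before; a minimal sketch:

sch, args = task.apply_best(log_file)
func = tvm.build(sch, args, target)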