#ifdef __aarch64__

#include "MNNAsmGlobal.h"

// Target: AArch64 (AAPCS64), GNU/Clang assembler syntax, run through the C
// preprocessor. `asm_function` is a macro provided by MNNAsmGlobal.h.

.text
.align 5                                // 2^5 = 32-byte alignment

//-----------------------------------------------------------------------
// void MNNAddBias(float* dst, const float* bias,
//                 size_t planeNumber, size_t biasNumber)
//
// Adds a 4-float bias vector, in place, to every 4-float unit of dst:
//
//   for z in [0, biasNumber):
//       b = bias[4*z .. 4*z+3]
//       for p in [0, planeNumber):
//           dst[(z*planeNumber + p)*4 .. +3] += b
//
// So dst holds biasNumber * planeNumber * 4 floats and bias holds
// biasNumber * 4 floats.
// NOTE(review): this looks like a channel-packed-by-4 tensor layout;
// confirm against the callers.
//
// ABI:   AAPCS64, leaf function, no stack usage
// In:    x0 = dst, x1 = bias, x2 = planeNumber, x3 = biasNumber
// Out:   none (dst is updated in place)
// Clobb: x0, x1, x3, x4, x5, v0-v3, v31, flags
//        (all caller-saved; no callee-saved register is touched)
//-----------------------------------------------------------------------
asm_function MNNAddBias

        // Nothing to do when either dimension is zero.
        cbz     x3, End
        cbz     x2, End

LoopBias:
        ld1     {v31.4s}, [x1], #16             // v31 = next 4 bias floats; bias += 4
        mov     x4, x2                          // x4 = planes left for this bias vector

        // The counters are size_t, so use unsigned conditions (lo/hs).
        cmp     x4, #4
        b.lo    L1                              // fewer than 4 planes: scalar tail only

        // Unrolled by 4 planes (16 floats) per iteration. Loads, adds and
        // stores are interleaved to shorten dependency chains.
        // Invariant: x4 >= 4 on entry to every iteration.
Loop4:
        mov     x5, x0                          // x5 = read cursor, x0 = write cursor
        ld1     {v0.4s, v1.4s}, [x5], #32
        fadd    v0.4s, v0.4s, v31.4s
        ld1     {v2.4s, v3.4s}, [x5]
        fadd    v1.4s, v1.4s, v31.4s
        fadd    v2.4s, v2.4s, v31.4s
        st1     {v0.4s, v1.4s}, [x0], #32
        fadd    v3.4s, v3.4s, v31.4s
        st1     {v2.4s, v3.4s}, [x0], #32       // planes 2 and 3 (must be v2/v3, not v1/v3)
        sub     x4, x4, #4
        cmp     x4, #4
        b.hs    Loop4

        // Tail: 0..3 planes left, one plane (4 floats) per iteration.
L1:
        cbz     x4, EndLoopPlane
Loop1:
        ld1     {v0.4s}, [x0]
        fadd    v0.4s, v0.4s, v31.4s
        subs    x4, x4, #1                      // flags survive st1 and feed b.ne below
        st1     {v0.4s}, [x0], #16
        b.ne    Loop1

EndLoopPlane:
        subs    x3, x3, #1
        b.ne    LoopBias

End:
        ret

#endif