#ifdef __aarch64__
#include "MNNAsmGlobal.h"

.text
.align 5                                // align the entry point to 2^5 = 32 bytes

//-----------------------------------------------------------------------
// void MNNAddBias(float* dst, const float* bias,
//                 size_t planeNumber, size_t biasNumber)
//
// Target: AArch64, GNU assembler syntax, run through the C preprocessor.
//
// For each of the biasNumber bias vectors (4 floats each, read
// sequentially from bias), adds that vector in place to the next
// planeNumber consecutive 4-float vectors of dst.  Overall the routine
// walks biasNumber * planeNumber * 4 floats of dst and
// biasNumber * 4 floats of bias.
//
// In:    x0 = dst, x1 = bias, x2 = planeNumber, x3 = biasNumber
// Out:   none (dst is updated in place)
// Clobb: x0, x1, x3, x4, x5, v0-v3, v31, condition flags
// Stack: not used (leaf function)
//
// Returns immediately when planeNumber or biasNumber is zero.
//-----------------------------------------------------------------------
asm_function MNNAddBias
    cbz     x3, End                     // biasNumber == 0: nothing to do
    cbz     x2, End                     // planeNumber == 0: nothing to do

LoopBias:
    ld1     {v31.4s}, [x1], #16         // v31 = next 4 bias floats, bias += 16 bytes
    mov     x4, x2                      // x4 = vectors left in the current plane

// Unrolled path: four 4-float vectors (64 bytes) per iteration.
L4:
    cmp     x4, #4
    b.lo    L1                          // fewer than 4 left: tail loop only (unsigned count)

Loop4:
    mov     x5, x0                      // x5 = read cursor, x0 remains the write cursor
    // Loads, adds and stores are interleaved to shorten dependency chains.
    ld1     {v0.4s, v1.4s}, [x5], #32
    fadd    v0.4s, v0.4s, v31.4s
    ld1     {v2.4s, v3.4s}, [x5]
    fadd    v1.4s, v1.4s, v31.4s
    fadd    v2.4s, v2.4s, v31.4s
    st1     {v0.4s, v1.4s}, [x0], #32   // write back vectors 0 and 1
    fadd    v3.4s, v3.4s, v31.4s
    st1     {v2.4s, v3.4s}, [x0], #32   // write back vectors 2 and 3
    sub     x4, x4, #4
    cmp     x4, #4
    b.hs    Loop4                       // at least 4 vectors still pending

// Tail path: one 4-float vector (16 bytes) per iteration.
L1:
    cbz     x4, EndLoopPlane

Loop1:
    ld1     {v0.4s}, [x0]
    fadd    v0.4s, v0.4s, v31.4s
    subs    x4, x4, #1                  // sets Z for the branch below
    st1     {v0.4s}, [x0], #16          // st1 leaves the flags untouched
    b.ne    Loop1

EndLoopPlane:
    subs    x3, x3, #1                  // one bias vector consumed
    b.ne    LoopBias

End:
    ret
#endif