#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text // 代码段
.align 5 // 地址对齐 2^5 = 32
asm_function MNNAddBias
//-----------------------------------------------------------------------
// void MNNAddBias(float* dst, const float* bias,
//                 size_t planeNumber, size_t biasNumber)
//
// In-place bias add. For each of the biasNumber bias vectors (4 floats,
// 16 bytes each, read consecutively from bias), adds that vector to the
// next planeNumber consecutive 4-float elements of dst.
// dst is therefore walked linearly over biasNumber * planeNumber * 4 floats.
//
// In:    x0 = dst, x1 = bias, x2 = planeNumber, x3 = biasNumber
// Out:   none (dst is updated in place)
// Clobb: x0, x1, x3, x4, x5, v0-v3, v31, flags
//        (all caller-saved under AAPCS64; no stack usage)
//-----------------------------------------------------------------------
    // Nothing to do if either count is zero.
    cbz     x3, End
    cbz     x2, End

LoopBias:
    ld1     {v31.4s}, [x1], #16         // v31 = current bias vector; bias += 4 floats
    mov     x4, x2                      // x4 = plane elements left for this bias

// Unrolled path: four plane elements (16 floats) per iteration.
L4:
    cmp     x4, #4
    blo     L1                          // counts are size_t: unsigned compare

Loop4:
    mov     x5, x0                      // x5 = read cursor, x0 = write cursor
    // Loads, adds and stores are interleaved to shorten dependency chains.
    ld1     {v0.4s, v1.4s}, [x5], #32
    fadd    v0.4s, v0.4s, v31.4s
    ld1     {v2.4s, v3.4s}, [x5]
    fadd    v1.4s, v1.4s, v31.4s
    fadd    v2.4s, v2.4s, v31.4s
    st1     {v0.4s, v1.4s}, [x0], #32
    fadd    v3.4s, v3.4s, v31.4s
    st1     {v2.4s, v3.4s}, [x0], #32   // store elements 2 and 3 (v2, not v1)
    sub     x4, x4, #4
    cmp     x4, #4
    bhs     Loop4

// Tail: remaining 0..3 plane elements, one at a time.
L1:
    cbz     x4, EndLoopPlane

Loop1:
    ld1     {v0.4s}, [x0]
    fadd    v0.4s, v0.4s, v31.4s
    subs    x4, x4, #1                  // flags consumed by bne; st1 leaves them intact
    st1     {v0.4s}, [x0], #16
    bne     Loop1

EndLoopPlane:
    subs    x3, x3, #1                  // next bias vector
    bne     LoopBias

End:
    ret
#endif