#ifdef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5                            // 2^5 = 32-byte alignment

asm_function MNNAddBias
//-----------------------------------------------------------------------
// void MNNAddBias(float* dst, const float* bias,
//                 size_t planeNumber, size_t biasNumber)
//
// For each of the biasNumber bias vectors (4 floats each), adds that
// vector in place to planeNumber consecutive 4-float vectors of dst.
// dst is walked linearly: biasNumber * planeNumber * 4 floats in total.
// Returns immediately when either count is zero.
//
// In:       x0 = dst, x1 = bias, x2 = planeNumber, x3 = biasNumber
// Scratch:  x4 = planes left for the current bias vector
//           x5 = read cursor inside the unrolled loop
//           v0-v3 = data, v31 = current bias vector, condition flags
//-----------------------------------------------------------------------

cbz x3, End                         // no bias vectors: nothing to do
cbz x2, End                         // no planes: nothing to do

LoopBias:
ld1 {v31.4s}, [x1], #16             // v31 = next 4 bias floats; bias += 16 bytes
mov x4, x2                          // x4 = planeNumber

// Unrolled path: four planes (4 x 4 floats) per iteration.
L4:
cmp x4, #3
ble L1                              // fewer than 4 planes left: go to the tail loop

Loop4:
mov x5, x0                          // separate read cursor so loads run ahead of the stores
// Loads, adds and stores are interleaved to shorten dependency chains.
ld1 {v0.4s, v1.4s}, [x5], #32       // planes 0 and 1
fadd v0.4s, v0.4s, v31.4s
ld1 {v2.4s, v3.4s}, [x5]            // planes 2 and 3
fadd v1.4s, v1.4s, v31.4s
fadd v2.4s, v2.4s, v31.4s
st1 {v0.4s, v1.4s}, [x0], #32       // write back planes 0 and 1
fadd v3.4s, v3.4s, v31.4s
st1 {v2.4s, v3.4s}, [x0], #32       // write back planes 2 and 3
sub x4, x4, #4
cmp x4, #4
bge Loop4

// Tail: remaining planes, one per iteration.
L1:
cbz x4, EndLoopPlane

Loop1:
ld1 {v0.4s}, [x0]
fadd v0.4s, v0.4s, v31.4s
subs x4, x4, #1                     // sets flags for the bne below; st1 leaves them intact
st1 {v0.4s}, [x0], #16
bne Loop1

EndLoopPlane:
subs x3, x3, #1                     // one bias vector done
bne LoopBias

End:
ret

#endif