#include "Vec128.h"
/*
 * Transpose a 4x4 matrix of 32-bit floats.
 *
 * m_des  - destination, 16 consecutive floats (written).
 * m_src1 - source, 16 consecutive floats (read).
 *
 * The whole source is read before anything is written to the destination,
 * so m_des may be the same pointer as m_src1 (in-place transpose).
 */
void Mat4x4TransposeF32_(float* m_des, const float* m_src1) {
#if defined(__aarch64__)
    /*
     * The pointers are passed as explicit operands rather than assuming they
     * still live in x0/x1, so the code stays correct if the function is
     * inlined or the compiler moves the arguments. The "memory" clobber tells
     * the compiler that the asm reads and writes memory through them.
     *
     * Lane naming in the comments: a, b, c, d are the four source vectors
     * (v0..v3 after the load); the digit is the lane index.
     */
    __asm volatile (
        "ld1  {v0.4s-v3.4s}, [%[src]]   \n\t"
        /* 2x2 transposes of 32-bit lanes within each pair of vectors. */
        "trn1 v4.4s, v0.4s, v1.4s       \n\t"  /* a0 b0 a2 b2 */
        "trn2 v5.4s, v0.4s, v1.4s       \n\t"  /* a1 b1 a3 b3 */
        "trn1 v6.4s, v2.4s, v3.4s       \n\t"  /* c0 d0 c2 d2 */
        "trn2 v7.4s, v2.4s, v3.4s       \n\t"  /* c1 d1 c3 d3 */
        /* Combine the 64-bit halves to form the final vectors. */
        "trn1 v0.2d, v4.2d, v6.2d       \n\t"  /* a0 b0 c0 d0 */
        "trn1 v1.2d, v5.2d, v7.2d       \n\t"  /* a1 b1 c1 d1 */
        "trn2 v2.2d, v4.2d, v6.2d       \n\t"  /* a2 b2 c2 d2 */
        "trn2 v3.2d, v5.2d, v7.2d       \n\t"  /* a3 b3 c3 d3 */
        "st1  {v0.4s-v3.4s}, [%[dst]]   \n\t"
        : /* no register outputs; results are stored through dst */
        : [dst] "r" (m_des), [src] "r" (m_src1)
        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
    );
#else
    /*
     * Portable fallback for non-AArch64 builds. The result is staged in a
     * temporary so that an in-place call (m_des == m_src1) behaves the same
     * as the load-all-then-store-all assembly path.
     */
    float transposed[16];
    for (int row = 0; row < 4; ++row) {
        for (int col = 0; col < 4; ++col) {
            transposed[col * 4 + row] = m_src1[row * 4 + col];
        }
    }
    for (int i = 0; i < 16; ++i) {
        m_des[i] = transposed[i];
    }
#endif
}
|