#include "vec.h"
/*
 * AArch64 assembler macro definitions for 3-component float cross products,
 * stored as a C string literal so they can be pasted in front of an
 * inline-asm template.  Every macro works on fixed registers:
 *
 *   VecCp      scalar:  (s19,s20,s21) = (s0,s1,s2) x (s3,s4,s5)
 *   VecCp4     4 lanes: (v19,v20,v21) = (v0,v1,v2) x (v3,v4,v5)
 *   VecCp4AOS  loads 4 interleaved xyz triples from [x1] and from [x2], runs
 *              VecCp4, stores the 4 results interleaved to [x0]; x0, x1 and
 *              x2 are each post-incremented by 48 bytes
 *   VecCp4SOA  loads 4 floats from each of [x7]..[x12] (ax,ay,az,bx,by,bz),
 *              runs VecCp4, stores to [x13],[x14],[x15] (cx,cy,cz); each of
 *              x7..x15 is post-incremented by 16 bytes
 *
 * The "//" comments below are assembler comments that live inside the string;
 * they are not C comments.
 *
 * NOTE: expand ASM_MACROS in only one asm statement per translation unit --
 * assemblers reject a second .macro definition with an already-used name.
 */
#define ASM_MACROS "\n\
// c = cross(a,b) \n\
// a: s0, s1, s2 \n\
// b: s3, s4, s5 \n\
// c: s19, s20, s21 \n\
.macro VecCp \n\
fmul s19, s1, s5 // s19 = ay * bz \n\
fmsub s19, s2, s4, s19 // s19 = s19 - az * by \n\
fmul s20, s2, s3 // s20 = az * bx \n\
fmsub s20, s0, s5, s20 // s20 = s20 - ax * bz \n\
fmul s21, s0, s4 // s21 = ax * by \n\
fmsub s21, s1, s3, s21 // s21 = s21 - ay * bx \n\
.endm \n\
\n\
// c[0-3] = SIMD:cross(a[0-3],b[0-3]) \n\
// a: v0, v1, v2 \n\
// b: v3, v4, v5 \n\
// c: v19, v20, v21 \n\
.macro VecCp4 \n\
fmul v19.4s, v1.4s, v5.4s // v19 = ay * bz \n\
fmls v19.4s, v2.4s, v4.4s // v19 -= az * by \n\
fmul v20.4s, v2.4s, v3.4s // v20 = az * bx \n\
fmls v20.4s, v0.4s, v5.4s // v20 -= ax * bz \n\
fmul v21.4s, v0.4s, v4.4s // v21 = ax * by \n\
fmls v21.4s, v1.4s, v3.4s // v21 -= ay * bx \n\
.endm \n\
\n\
// c[0-3] = SIMD:cross(a[0-3],b[0-3]) \n\
// a: ( [x1],[x1+4],[x1+8] ) * 4 \n\
// b: ( [x2],[x2+4],[x2+8] ) * 4 \n\
// c: ( [x0],[x0+4],[x0+8] ) * 4 \n\
.macro VecCp4AOS \n\
ld3 {v0.4s, v1.4s, v2.4s}, [x1], 48 // load (ax,ay,az) into (v0,v1,v2); x1 += 48 \n\
ld3 {v3.4s, v4.4s, v5.4s}, [x2], 48 // load (bx,by,bz) into (v3,v4,v5); x2 += 48 \n\
VecCp4 // c = cross(a,b) \n\
st3 {v19.4s, v20.4s, v21.4s}, [x0], 48 // store (cx,cy,cz) from (v19,v20,v21); x0 += 48 \n\
.endm \n\
\n\
// c = SIMD:cross(a,b) \n\
// ax:[x7] * 4 \n\
// ay:[x8] * 4 \n\
// az:[x9] * 4 \n\
// bx:[x10] * 4 \n\
// by:[x11] * 4 \n\
// bz:[x12] * 4 \n\
// cx:[x13] * 4 \n\
// cy:[x14] * 4 \n\
// cz:[x15] * 4 \n\
.macro VecCp4SOA \n\
ld1 {v0.4s}, [x7], 16 // v0: ax \n\
ld1 {v1.4s}, [x8], 16 // v1: ay \n\
ld1 {v2.4s}, [x9], 16 // v2: az \n\
ld1 {v3.4s}, [x10], 16 // v3: bx \n\
ld1 {v4.4s}, [x11], 16 // v4: by \n\
ld1 {v5.4s}, [x12], 16 // v5: bz \n\
VecCp4 // c = cross(a,b) \n\
st1 {v19.4s}, [x13], 16 // [x13]: cx \n\
st1 {v20.4s}, [x14], 16 // [x14]: cy \n\
st1 {v21.4s}, [x15], 16 // [x15]: cz \n\
.endm \n\
\n\
"
/*
 * Element-wise cross product over array-of-structures data:
 * c[i] = a[i] x b[i] for i in [0, n).
 *
 * c: destination array, written by the asm.
 * a: left-hand operands, read only.
 * b: right-hand operands, read only.
 * n: number of vectors; values <= 0 are treated as "nothing to do".
 *
 * The kernel reads and writes 12 bytes per vector (three consecutive 32-bit
 * floats), so it assumes Vector has exactly that layout -- TODO confirm
 * against vec.h.
 *
 * While at least 16 vectors remain they are processed 16 per iteration
 * (four VecCp4AOS expansions of 4 vectors each); the remainder is handled
 * one vector at a time with the scalar VecCp macro.
 *
 * This asm statement is also where ASM_MACROS is expanded, i.e. where the
 * assembler macros get defined for the translation unit.
 */
void CrossProdAOS_(Vector* c, const Vector* a, const Vector* b, int n) {
    /* n is a 32-bit int but the asm counts in the full 64-bit x3; widen it
       explicitly (the upper half of an int argument register is unspecified)
       and clamp negatives so the unsigned compares below cannot run away. */
    const unsigned long long count = (n > 0) ? (unsigned long long)n : 0ULL;

    /* The macros address memory through fixed registers (x0 = c, x1 = a,
       x2 = b) and the loop counts in x3.  Pin each value to its register and
       hand it to the asm as a read-write operand instead of relying on the
       arguments still sitting in their incoming registers. */
    register Vector* dst __asm("x0") = c;
    register const Vector* lhs __asm("x1") = a;
    register const Vector* rhs __asm("x2") = b;
    register unsigned long long remaining __asm("x3") = count;

    __asm volatile (ASM_MACROS "\n\
\n\
cmp x3, 16 // if n < 16 \n\
b.lo LSkipLoop1A // goto SkipLoop1A \n\
\n\
LLoop1A: \n\
VecCp4AOS \n\
VecCp4AOS \n\
VecCp4AOS \n\
VecCp4AOS \n\
sub x3, x3, 16 // n -= 16 \n\
cmp x3, 16 // if n >= 16 \n\
b.hs LLoop1A // goto Loop1A \n\
LSkipLoop1A: \n\
cbz x3, LDoneA // if n == 0 goto DoneA \n\
LLoop2A: \n\
ldp s0, s1, [x1], 8 // (s0, s1) = (ax, ay); x1 += 8 \n\
ldr s2, [x1], 4 // s2 = az; x1 += 4 \n\
ldp s3, s4, [x2], 8 // (s3, s4) = (bx, by); x2 += 8 \n\
ldr s5, [x2], 4 // s5 = bz; x2 += 4 \n\
VecCp \n\
stp s19, s20, [x0], 8 // [x0] = (cx, cy); x0 += 8 \n\
str s21, [x0], 4 // [x0] = cz; x0 += 4 \n\
subs x3, x3, 1 // if --n != 0 \n\
b.ne LLoop2A // goto Loop2A \n\
LDoneA: \n\
"
    : "+r"(dst), "+r"(lhs), "+r"(rhs), "+r"(remaining)
    :
    /* "cc": cmp/subs change the flags; "memory": the asm reads a/b and
       stores through c behind the compiler's back. */
    : "v0", "v1", "v2", "v3", "v4", "v5", "v19", "v20", "v21",
      "cc", "memory"
    );
}
/*
 * Element-wise cross product over structure-of-arrays data:
 * (c.x[i], c.y[i], c.z[i]) = a[i] x b[i] for i in [0, n).
 *
 * c: destination; its three component arrays are written by the asm.
 * a: left-hand operands, read only.
 * b: right-hand operands, read only.
 * n: number of vectors; values <= 0 are treated as "nothing to do".
 *
 * The asm fetches three consecutive 8-byte pointers from the start of each
 * VectorSoA object and treats them as the x, y and z float arrays, so it
 * assumes that layout -- TODO confirm against vec.h.
 *
 * While at least 16 vectors remain they are processed 16 per iteration
 * (four VecCp4SOA expansions of 4 vectors each); the remainder is handled
 * one vector at a time with the scalar VecCp macro.
 *
 * NOTE: this asm statement does not expand ASM_MACROS itself.  It relies on
 * VecCp / VecCp4SOA having already been defined earlier in the same assembly
 * output (CrossProdAOS_ expands ASM_MACROS), so it must be emitted after
 * that definition.
 */
void CrossProdSOA_(VectorSoA& c, const VectorSoA& a, const VectorSoA& b, int n) {
    /* n is a 32-bit int but the asm counts in the full 64-bit x3; widen it
       explicitly (the upper half of an int argument register is unspecified)
       and clamp negatives so the unsigned compares below cannot run away. */
    const unsigned long long count = (n > 0) ? (unsigned long long)n : 0ULL;

    /* The asm expects the addresses of c, a and b in x0, x1 and x2 and the
       count in x3.  Pin each value to its register and hand it to the asm as
       a read-write operand instead of relying on the arguments still sitting
       in their incoming registers. */
    register VectorSoA* dst __asm("x0") = &c;
    register const VectorSoA* lhs __asm("x1") = &a;
    register const VectorSoA* rhs __asm("x2") = &b;
    register unsigned long long remaining __asm("x3") = count;

    __asm volatile ("\n\
\n\
ldp x7, x8, [x1], 16 // (x7, x8) = address of (ax, ay) \n\
ldr x9, [x1] // x9 = address of az \n\
ldp x10, x11, [x2], 16 // (x10, x11) = address of (bx, by) \n\
ldr x12, [x2] // x12 = address of bz \n\
ldp x13, x14, [x0], 16 // (x13, x14) = address of (cx, cy) \n\
ldr x15, [x0] // x15 = address of cz \n\
\n\
cmp x3, 16 // if n < 16 \n\
b.lo LSkipLoop1B // goto SkipLoop1B \n\
\n\
LLoop1B: \n\
VecCp4SOA \n\
VecCp4SOA \n\
VecCp4SOA \n\
VecCp4SOA \n\
sub x3, x3, 16 // n -= 16 \n\
cmp x3, 16 // if n >= 16 \n\
b.hs LLoop1B // goto Loop1B \n\
LSkipLoop1B: \n\
cbz x3, LDoneB // if n == 0 goto DoneB \n\
LLoop2B: \n\
ldr s0, [x7], 4 // s0 = ax; x7 += 4 \n\
ldr s1, [x8], 4 // s1 = ay; x8 += 4 \n\
ldr s2, [x9], 4 // s2 = az; x9 += 4 \n\
ldr s3, [x10], 4 // s3 = bx; x10 += 4 \n\
ldr s4, [x11], 4 // s4 = by; x11 += 4 \n\
ldr s5, [x12], 4 // s5 = bz; x12 += 4 \n\
VecCp \n\
str s19, [x13], 4 // [x13] = cx; x13 += 4 \n\
str s20, [x14], 4 // [x14] = cy; x14 += 4 \n\
str s21, [x15], 4 // [x15] = cz; x15 += 4 \n\
subs x3, x3, 1 // if --n != 0 \n\
b.ne LLoop2B // goto Loop2B \n\
LDoneB: \n\
"
    : "+r"(dst), "+r"(lhs), "+r"(rhs), "+r"(remaining)
    :
    /* x7..x15 hold the nine component-array cursors used by the macros.
       "cc": cmp/subs change the flags; "memory": the asm reads the a/b
       arrays and stores into the c arrays behind the compiler's back. */
    : "v0", "v1", "v2", "v3", "v4", "v5", "v19", "v20", "v21",
      "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
      "cc", "memory"
    );
}
|