#include "Vec128.h"
//#define UpdateSums(VregX, VregY) \
// fadd v16.4s, v16.4s, \VregX\().4s // update sum_x \n\
// fadd v17.4s, v17.4s, \VregX\().4s // update sum_y \n\
void CalcCorrCoef_(float* rho, float sums[5], const float* x, const float* y, size_t n, float epsilon) {
__asm volatile ("\n\
\n\
.macro UpdateSums VregX, VregY \n\
fadd.4s v16, v16, \\VregX\\() // sum_x: v16 += VregX \n\
fadd.4s v17, v17, \\VregY\\() // sum_y: v17 += VregX \n\
fmla.4s v18, \\VregX\\(), \\VregX\\() // sum_xx: v18 += VregX^2 \n\
fmla.4s v19, \\VregY\\(), \\VregY\\() // sum_yy: v19 += VregY^2 \n\
fmla.4s v20, \\VregX\\(), \\VregY\\() // sum_xy: v20 += VregX * VregY \n\
.endm \n\
\n\
cbz x4, LInvalidArg // if n == 0 goto END \n\
tst x2, 0x0f // if (x2 != 15) \n\
b.ne LInvalidArg // goto END \n\
tst x3, 0x0f // if (x3 != 15) \n\
b.ne LInvalidArg // goto END \n\
mov x5, x4 // save n to x5 \n\
\n\
eor v16.16b, v16.16b, v16.16b // sum_x = 0 \n\
eor v17.16b, v17.16b, v17.16b // sum_y = 0 \n\
eor v18.16b, v18.16b, v18.16b // sum_xx = 0 \n\
eor v19.16b, v19.16b, v19.16b // sum_yy = 0 \n\
eor v20.16b, v20.16b, v20.16b // sum_xy = 0 \n\
\n\
cmp x4, 16 // if n<=16 \n\
b.lo LSkipLoop1 // goto SkipLoop1 \n\
\n\
LLoop1: \n\
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], 64 // load x[0:16] \n\
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x3], 64 // load y[0:16] \n\
\n\
UpdateSums v0, v4 \n\
UpdateSums v1, v5 \n\
UpdateSums v2, v6 \n\
UpdateSums v3, v7 \n\
sub x4, x4, 16 // n -= 16 \n\
cmp x4, 16 // if x4 >= 16 \n\
b.hs LLoop1 // goto Loop \n\
\n\
LSkipLoop1: \n\
faddp v16.4s, v16.4s, v16.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\
faddp v16.4s, v16.4s, v16.4s // s16 = lane0+lane1 \n\
faddp v17.4s, v17.4s, v17.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\
faddp v17.4s, v17.4s, v17.4s // s17 = lane0+lane1 \n\
faddp v18.4s, v18.4s, v18.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\
faddp v18.4s, v18.4s, v18.4s // s18 = lane0+lane1 \n\
faddp v19.4s, v19.4s, v19.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\
faddp v19.4s, v19.4s, v19.4s // s19 = lane0+lane1 \n\
faddp v20.4s, v20.4s, v20.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\
faddp v20.4s, v20.4s, v20.4s // s20 = lane0+lane1 \n\
cbz x4, LSkipLoop2 // if x4==0 goto SkipLoop2 \n\
\n\
LLoop2: \n\
ldr s1, [x2], 4 // s1 = x; x+=4 \n\
ldr s2, [x3], 4 // s2 = y; y+=4 \n\
fadd s16, s16, s1 // s16 += s1 \n\
fadd s17, s17, s2 // s17 += s2 \n\
fmla.4s v18, v1, v1[0] // v18 += s1 * s1 \n\
fmla.4s v19, v2, v2[0] // f19 += s2 * s2 \n\
fmla.4s v20, v1, v2[0] // f20 += s1 * s2 \n\
subs x4, x4, 1 // if (--n !=0) \n\
b.ne LLoop2 // goto Loop2 \n\
\n\
LSkipLoop2: \n\
stp s16, s17, [x1], 8 // [x1]=s16,s17; x1+=8 \n\
stp s18, s19, [x1], 8 // [x1]=s18,s19; x1+=8 \n\
str s20, [x1] // [x1]=s20; x1+=8 \n\
\n\
// rho numerator \n\
scvtf s21, x5 // s21 = n \n\
fmul s1, s21, s20 // s1 = n * sum_xy \n\
fmls.4s v1, v16, v17[0] // s1 -= sum_x * sum_y \n\
\n\
// rho denominator \n\
fmul s2, s21, s18 // s2 = n * sum_xx \n\
fmsub s2, s16, s16, s2 // s2 = s2 - sum_x * sum_x \n \
fsqrt s2, s2 // s2 = sqrt(s2) \n\
\n\
fmul s3, s21, s19 // s2 = n * sum_yy \n\
fmsub s3, s17, s17, s3 // s2 = s3 - sum_y * sum_y \n\
fsqrt s3, s3 // s3 = sqrt(s3) \n\
\n\
fmul s4, s2, s3 // s4 = s2 * s3 \n\
fcmp s4, s0 // if rho_den < epsilon \n\
b.lo LBadRhoDen // goto BadRhoDen \n\
\n\
fdiv s5, s1, s4 // s5 = rho \n\
str s5, [x0] // [x0] = s5 \n\
mov w0, 1 // return code: success \n\
ret \n\
\n\
LBadRhoDen: \n\
eor v5.16b, v5.16b, v5.16b // rho = 0 \n\
str s5, [x0] // [x0] = rho \n\
mov w0, 0 // return code: fail \n\
ret \n\
\n\
LInvalidArg: \n\
mov w0, 0 // return code: fail \n\
ret \n\
LReturn: \n\
"
:
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "x0", "x1"
);
}
|