Modern Arm Assembly Language Programming: Covers Armv8-A 32-bit, 64-bit, and SIMD

by Daniel Kusswurm
2021.07.28: updated by
Up

Chapter 16: Armv8-64 Advanced SIMD Programming

Vector and Matrix Operations

Vector Cross Products

NEON を使って、4次元ベクトルの外積計算を行うプログラムの説明

ch16_02

ch16_02/vec.h
#pragma once

// Simple vector structure
typedef struct {
    float X;        // X component
    float Y;        // Y component
    float Z;        // Z component
} Vector;

// Vector structure of arrays
typedef struct {
    float* X;       // X components
    float* Y;       // Y components
    float* Z;       // Z components
} VectorSoA;
ch16_02/main.cpp
#include <iostream>
#include <iomanip>
#include <memory>
#include <random>
#include "AlignedMem.h"
#include "vec.h"

#define C_ALIGN	16
#define SEP	" "
#define N_VEC	18
#define EPS     1.0e-9

#define NEQ(x, y)	(fabs((x) - (y)) > EPS))

using namespace std;

extern void CrossProdAOS_(Vector* c, const Vector* a, const Vector* b, int n);
extern void CrossProdSOA_(VectorSoA& c, const VectorSoA& a, const VectorSoA& b, int n);

void CrossProdAOS(Vector* c, const Vector* a, const Vector* b, int n) {
    for (size_t i = 0; i < n; i++) {
        c[i].X = a[i].Y * b[i].Z - a[i].Z * b[i].Y;
        c[i].Y = a[i].Z * b[i].X - a[i].X * b[i].Z;
        c[i].Z = a[i].X * b[i].Y - a[i].Y * b[i].X;
    }
}

void CrossProdSOA(VectorSoA& c, const VectorSoA& a, const VectorSoA& b, int n) {
    for (size_t i = 0; i < n; i++) {
        c.X[i] = a.Y[i] * b.Z[i] - a.Z[i] * b.Y[i];
        c.Y[i] = a.Z[i] * b.X[i] - a.X[i] * b.Z[i];
        c.Z[i] = a.X[i] * b.Y[i] - a.Y[i] * b.X[i];
    }
}

void InitVec(Vector* p, VectorSoA& q, int n, unsigned short int seed) {
  std::mt19937 mt {seed};
  std::uniform_int_distribution<> rand100 {1, 100};

  for (int i=0; i < n; i++) {
    p[i].X = q.X[i] = (float) rand100(mt);
    p[i].Y = q.Y[i] = (float) rand100(mt);
    p[i].Z = q.Z[i] = (float) rand100(mt);
  }
}


bool CompareCP(Vector* p, Vector* q, int n) {
  for (int i = 0; i < n; i++) {
    if (fabs(p[i].X - q[i].X) > EPS || fabs(p[i].Y - q[i].Y) > EPS || fabs(p[i].Z - q[i].Z) > EPS)
      return false;
  }
  return true;
}

bool CompareCP(VectorSoA& p, VectorSoA& q, int n) {
  for (int i = 0; i < n; i++) {
    if (fabs(p.X[i] - q.X[i]) > EPS || fabs(p.Y[i] - q.Y[i]) > EPS || fabs(p.Z[i] - q.Z[i]) > EPS)
      return false;
  }
  return true;
}

bool CompareCP(Vector* p, VectorSoA& q, int n) {
  for (int i = 0; i < n; i++) {
    if (fabs(p[i].X - q.X[i]) > EPS || fabs(p[i].Y - q.Y[i]) > EPS || fabs(p[i].Z - q.Z[i]) > EPS)
      return false;
  }
  return true;
}

void CrossProd(void) {
    unique_ptr<Vector> a_aos_up { new Vector[N_VEC] };
    unique_ptr<Vector> b_aos_up { new Vector[N_VEC] };
    unique_ptr<Vector> c1_aos_up { new Vector[N_VEC] };
    unique_ptr<Vector> c2_aos_up { new Vector[N_VEC] };
    Vector* a_aos = a_aos_up.get();
    Vector* b_aos = b_aos_up.get();
    Vector* c1_aos = c1_aos_up.get();
    Vector* c2_aos = c2_aos_up.get();
    AlignedArray<float> a_soa_x_aa(N_VEC, C_ALIGN);
    AlignedArray<float> a_soa_y_aa(N_VEC, C_ALIGN);
    AlignedArray<float> a_soa_z_aa(N_VEC, C_ALIGN);
    AlignedArray<float> b_soa_x_aa(N_VEC, C_ALIGN);
    AlignedArray<float> b_soa_y_aa(N_VEC, C_ALIGN);
    AlignedArray<float> b_soa_z_aa(N_VEC, C_ALIGN);
    AlignedArray<float> c1_soa_x_aa(N_VEC, C_ALIGN);
    AlignedArray<float> c1_soa_y_aa(N_VEC, C_ALIGN);
    AlignedArray<float> c1_soa_z_aa(N_VEC, C_ALIGN);
    AlignedArray<float> c2_soa_x_aa(N_VEC, C_ALIGN);
    AlignedArray<float> c2_soa_y_aa(N_VEC, C_ALIGN);
    AlignedArray<float> c2_soa_z_aa(N_VEC, C_ALIGN);
    VectorSoA a_soa, b_soa, c1_soa, c2_soa;
    a_soa.X = a_soa_x_aa.Data();
    a_soa.Y = a_soa_y_aa.Data();
    a_soa.Z = a_soa_z_aa.Data();
    b_soa.X = b_soa_x_aa.Data();
    b_soa.Y = b_soa_y_aa.Data();
    b_soa.Z = b_soa_z_aa.Data();
    c1_soa.X = c1_soa_x_aa.Data();
    c1_soa.Y = c1_soa_y_aa.Data();
    c1_soa.Z = c1_soa_z_aa.Data();
    c2_soa.X = c2_soa_x_aa.Data();
    c2_soa.Y = c2_soa_y_aa.Data();
    c2_soa.Z = c2_soa_z_aa.Data();


    InitVec(a_aos, a_soa, N_VEC, 2021);
    InitVec(b_aos, b_soa, N_VEC, 2022);

    CrossProdAOS(c1_aos, a_aos, b_aos, N_VEC);
    CrossProdAOS_(c2_aos, a_aos, b_aos, N_VEC);
    CrossProdSOA(c1_soa, a_soa, b_soa, N_VEC);
    CrossProdSOA_(c2_soa, a_soa, b_soa, N_VEC);
    bool compare_cp = CompareCP(c1_aos, c2_aos, N_VEC)
      && CompareCP(c1_soa, c2_soa, N_VEC)
      && CompareCP(c1_aos, c1_soa, N_VEC);
    
    cout << "Results for CrossProd\n";
    cout << fixed << setprecision(1);
    for (size_t i = 0; i < N_VEC; i++) {
        const unsigned int w = 7;
        cout << "Vector cross product #" << i << endl;
        cout << "  a:      ";
        cout << setw(w) << a_aos[i].X << SEP;
        cout << setw(w) << a_aos[i].Y << SEP;
        cout << setw(w) << a_aos[i].Z << SEP;
        cout << "  b: ";
        cout << setw(w) << b_aos[i].X << SEP;
        cout << setw(w) << b_aos[i].Y << SEP;
        cout << setw(w) << b_aos[i].Z << endl;
        if (compare_cp)
            cout << "  c:      ";
        else
            cout << "  c1_aos: ";
        cout << setw(w) << c1_aos[i].X << SEP;
        cout << setw(w) << c1_aos[i].Y << SEP;
        cout << setw(w) << c1_aos[i].Z << endl;
        if (!compare_cp) {
            cout << "  c2_aos: ";
            cout << setw(w) << c2_aos[i].X << SEP;
            cout << setw(w) << c2_aos[i].Y << SEP;
            cout << setw(w) << c2_aos[i].Z << endl;
            cout << "  c1_soa: ";
            cout << setw(w) << c1_soa.X[i] << SEP;
            cout << setw(w) << c1_soa.Y[i] << SEP;
            cout << setw(w) << c1_soa.Z[i] << endl;
            cout << "  c2_soa: ";
            cout << setw(w) << c2_soa.X[i] << SEP;
            cout << setw(w) << c2_soa.Y[i] << SEP;
            cout << setw(w) << c2_soa.Z[i] << endl;
        }
    }
}

int main() {
    CrossProd();
    //CrossProd_BM();
    return 0;
}
ch16_02/neon.cpp
#include "vec.h"

#define ASM_MACROS	"\n\
	// c = cross(a,b)                                                                            \n\
	//   a: s0, s1, s2                                                                           \n\
	//   b: s3, s4, s5                                                                           \n\
	//   c: s19, s20, s21                                                                        \n\
	.macro	VecCp                                                                                \n\
	fmul	s19, s1, s5                 // s19 = ay * bz                                         \n\
	fmsub	s19, s2, s4, s19            // s19 = s19 - az * by                                   \n\
	fmul	s20, s2, s3                 // s20 = az * bx                                         \n\
	fmsub	s20, s0, s5, s20            // s20 = s20 - az * by                                   \n\
	fmul	s21, s0, s4                 // s21 = az * bx                                         \n\
	fmsub	s21, s1, s3, s21            // s21 = s21 - az * by                                   \n\
	.endm	                                                                                     \n\
		                                                                                     \n\
	// c[0-3] = SIMD:cross(a[0-3],b[0-3])                                                        \n\
	//   a: v0, v1, v2                                                                           \n\
	//   b: v3, v4, v5                                                                           \n\
	//   c: v19, v20, v21                                                                        \n\
	.macro	VecCp4                                                                               \n\
	fmul	v19.4s, v1.4s, v5.4s        // v19 = ay * bz                                         \n\
	fmls	v19.4s, v2.4s, v4.4s        // v19 -= - az * by                                      \n\
	fmul	v20.4s, v2.4s, v3.4s        // v20 = az * bx                                         \n\
	fmls	v20.4s, v0.4s, v5.4s        // v20 -= az * by                                        \n\
	fmul	v21.4s, v0.4s, v4.4s        // v21 = az * bx                                         \n\
	fmls	v21.4s, v1.4s, v3.4s        // v21 -= az * by                                        \n\
	.endm	                                                                                     \n\
		                                                                                     \n\
	// c[0-3] = SIMD:cross(a[0-3],b[0-3])                                                        \n\
	//   a: ( [x1],[x1+4],[x1+8] ) * 4                                                           \n\
	//   b: ( [x2],[x2+4],[x2+8] ) * 4                                                           \n\
	//   c: ( [x0],[x0+4],[x0+8] ) * 4                                                           \n\
	.macro	VecCp4AOS                                                                            \n\
	ld3	{v0.4s, v1.4s, v2.4s}, [x1], 48     // load (ax,ay,az) into (v0,v1,v2); x1 += 48     \n\
	ld3	{v3.4s, v4.4s, v5.4s}, [x2], 48     // load (bx,vy,vz) into (v3,v4,v5); x2 += 48     \n\
	VecCp4                                      // c = cross(a,b)                                \n\
	st3	{v19.4s, v20.4s, v21.4s}, [x0], 48  // store (cx,cy,cz) from (v19,v20,v21); x0 += 48 \n\
	.endm	                                                                                     \n\
		                                                                                     \n\
	// c = SIMD:cross(a,b)                                                                       \n\
	//   ax:[x7] * 4                                                                             \n\
	//   ay:[x8] * 4                                                                             \n\
	//   az:[x9] * 4                                                                             \n\
	//   bx:[x10] * 4                                                                            \n\
	//   by:[x11] * 4                                                                            \n\
	//   bz:[x12] * 4                                                                            \n\
	//   cx:[x13] * 4                                                                            \n\
	//   cy:[x14] * 4                                                                            \n\
	//   cz:[x15] * 4                                                                            \n\
	.macro	VecCp4SOA                                                                            \n\
	ld1	{v0.4s}, [x7], 16           // v0: ax                                                \n\
	ld1	{v1.4s}, [x8], 16           // v1: ay                                                \n\
	ld1	{v2.4s}, [x9], 16           // v2: az                                                \n\
	ld1	{v3.4s}, [x10], 16          // v3: bx                                                \n\
	ld1	{v4.4s}, [x11], 16          // v4: by                                                \n\
	ld1	{v5.4s}, [x12], 16          // v5: bz                                                \n\
	VecCp4                              // c = cross(a,b)                                        \n\
	st1	{v19.4s}, [x13], 16         // [x13]: cx                                             \n\
	st1	{v20.4s}, [x14], 16         // [x14]: cx                                             \n\
	st1	{v21.4s}, [x15], 16         // [x15]: cy                                             \n\
	.endm	                                                                                     \n\
		                                                                                     \n\
"


void CrossProdAOS_(Vector* c, const Vector* a, const Vector* b, int n) {
  __asm volatile (ASM_MACROS "\n\
		                                                                                     \n\
	cmp	x3, 16                      // if n < 0                                              \n\
	b.lo	LSkipLoop1A                 //   goto SkipLoop1A                                     \n\
		                                                                                     \n\
LLoop1A:	                                                                                     \n\
	VecCp4AOS                                                                                    \n\
	VecCp4AOS                                                                                    \n\
	VecCp4AOS                                                                                    \n\
	VecCp4AOS                                                                                    \n\
	sub	x3, x3, 16                  // n -= 16                                               \n\
	cmp	x3, 16                      // if n >= 16                                            \n\
	b.hs	LLoop1A                     //   goto Loop1A                                         \n\
LSkipLoop1A:	                                                                                     \n\
	cbz	x3, LDoneA                                                                           \n\
LLoop2A:	                                                                                     \n\
	ldp	s0, s1, [x1], 8            // (s0, s1) = (ax, ay); x1 += 8                           \n\
	ldr	s2, [x1], 4                // s2 = az; x1 += 4       b                               \n\
	ldp	s3, s4, [x2], 8            // (s3, s4) = (bx, by); x1 += 8                           \n\
	ldr	s5, [x2], 4                // s5 = bz; x1 += 4                                       \n\
	VecCp	                                                                                     \n\
	stp	s19, s20, [x0], 8          // [x0] = (cx, cy); x0 += 8                               \n\
	str	s21, [x0], 4               // [x0] = cz; x0 += 4                                     \n\
	subs	x3, x3, 1                  // if --x3 != 0                                           \n\
	b.ne	LLoop2A                    //   goto Loop2A                                          \n\
LDoneA:	                                                                                             \n\
"
		  :
		  :
 		  : "v0", "v1", "v2", "v3", "v4", "v5", "v19", "v20", "v21", "x0", "x1", "x2", "x3"
		  );
}

void CrossProdSOA_(VectorSoA& c, const VectorSoA& a, const VectorSoA& b, int n) {
  __asm volatile ("\n\
		                                                                                     \n\
	ldp	x7, x8, [x1], 16            // (x7, x8)  = address of (ax, ay)                       \n\
	ldr	x9, [x1]                    // x9 = address of az                                    \n\
	ldp	x10, x11, [x2], 16          // (x10, x11)  = address of (bx, by)                     \n\
	ldr	x12, [x2]                   // x12 = address of bz                                   \n\
	ldp	x13, x14, [x0], 16          // (x13, x14)  = address of (cx, cy)                     \n\
	ldr	x15, [x0]                   // x15 = address of cz                                   \n\
		                                                                                     \n\
	cmp	x3, 16                      // if n < 0                                              \n\
	b.lo	LSkipLoop1B                 //   goto SkipLoop1A                                     \n\
		                                                                                     \n\
LLoop1B:	                                                                                     \n\
	VecCp4SOA                                                                                    \n\
	VecCp4SOA                                                                                    \n\
	VecCp4SOA                                                                                    \n\
	VecCp4SOA                                                                                    \n\
	sub	x3, x3, 16                  // n -= 16                                               \n\
	cmp	x3, 16                      // if n >= 16                                            \n\
	b.hs	LLoop1B                     //   goto Loop1B                                         \n\
LSkipLoop1B:	                                                                                     \n\
	cbz	x3, LDoneB                                                                           \n\
LLoop2B:	                                                                                     \n\
	ldr	s0, [x7], 4                // s0 = ax; x7 += 4                                       \n\
	ldr	s1, [x8], 4                // s1 = ay; x8 += 4                                       \n\
	ldr	s2, [x9], 4                // s2 = az; x9 += 4                                       \n\
	ldr	s3, [x10], 4               // s3 = bx; x10 += 4                                      \n\
	ldr	s4, [x11], 4               // s4 = by; x11 += 4                                      \n\
	ldr	s5, [x12], 4               // s5 = bz; x12 += 4                                      \n\
	VecCp	                                                                                     \n\
	str	s19, [x13], 4              // [x13] = cx; x13 += 4                                   \n\
	str	s20, [x14], 4              // [x14] = cy; x14 += 4                                   \n\
	str	s21, [x15], 4              // [x15] = cz; x15 += 4                                   \n\
	subs	x3, x3, 1                  // if --n != 0                                            \n\
	b.ne	LLoop2B                    //   goto Loop2B                                          \n\
LDoneB:	                                                                                             \n\
"
		  :
		  :
 		  : "v0", "v1", "v2", "v3", "v4", "v5", "v19", "v20", "v21", 
		  "x0", "x1", "x2", "x3", 
		  "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15"
		  );
}
ch16_02/main.cpp の実行例
arm64@manet ch16_02 % g++ -I.. -I. -std=c++11 -O -S neon.cpp
arm64@manet ch16_02 % g++ -I.. -I. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet ch16_02 % ./a.out
Results for CrossProd
Vector cross product #0
  a:         86.0    58.0     1.0   b:    93.0    46.0    50.0
  c:       2854.0 -4207.0 -1438.0
Vector cross product #1
  a:         95.0    87.0    45.0   b:    56.0    89.0    19.0
  c:      -2352.0   715.0  3583.0
Vector cross product #2
  a:         63.0    92.0    30.0   b:    25.0    17.0    54.0
  c:       4458.0 -2652.0 -1229.0
Vector cross product #3
  a:         22.0    94.0    25.0   b:    42.0    34.0    28.0
  c:       1782.0   434.0 -3200.0
Vector cross product #4
  a:         13.0    71.0    71.0   b:    12.0    20.0    95.0
  c:       5325.0  -383.0  -592.0
Vector cross product #5
  a:         34.0     8.0     2.0   b:    76.0    49.0    20.0
  c:         62.0  -528.0  1058.0
Vector cross product #6
  a:         98.0    27.0    67.0   b:    39.0    73.0    15.0
  c:      -4486.0  1143.0  6101.0
Vector cross product #7
  a:         49.0   100.0    64.0   b:    94.0    17.0    91.0
  c:       8012.0  1557.0 -8567.0
Vector cross product #8
  a:         50.0    17.0    51.0   b:    12.0    95.0    98.0
  c:      -3179.0 -4288.0  4546.0
Vector cross product #9
  a:         55.0    53.0    94.0   b:     3.0    81.0    16.0
  c:      -6766.0  -598.0  4296.0
Vector cross product #10
  a:          6.0    50.0    39.0   b:    24.0    82.0    88.0
  c:       1202.0   408.0  -708.0
Vector cross product #11
  a:         15.0    72.0    86.0   b:    86.0    38.0    57.0
  c:        836.0  6541.0 -5622.0
Vector cross product #12
  a:         71.0    42.0    22.0   b:    84.0    13.0    46.0
  c:       1646.0 -1418.0 -2605.0
Vector cross product #13
  a:         26.0    11.0    37.0   b:    91.0    77.0    15.0
  c:      -2684.0  2977.0  1001.0
Vector cross product #14
  a:         20.0    58.0    83.0   b:    97.0    14.0    78.0
  c:       3362.0  6491.0 -5346.0
Vector cross product #15
  a:         91.0    16.0    41.0   b:    53.0    43.0    54.0
  c:       -899.0 -2741.0  3065.0
Vector cross product #16
  a:         77.0    54.0    12.0   b:    65.0    46.0    94.0
  c:       4524.0 -6458.0    32.0
Vector cross product #17
  a:         20.0    34.0    79.0   b:    64.0    24.0    38.0
  c:       -604.0  4296.0 -1696.0
arm64@manet ch16_02 %


http://nw.tsuda.ac.jp/