Modern Arm Assembly Language Programming: Covers Armv8-A 32-bit, 64-bit, and SIMD

by Daniel Kusswurm
2021.07.28: updated by
Up

Chapter 15: Armv8-64 SIMD Floating-Point Programming

Packed Floating-Point Matrices

Matrix Multiplication

NEON を使って4x4行列の乗算を行うプログラムの説明




ch15_06/main.cpp
#include <iostream>
#include <iomanip>
#include "MatrixF32.h"

using namespace std;

// NEON inline-assembly 4x4 single-precision matrix multiply (defined in
// ch15_06/neon.cpp). Reads 16 consecutive floats from each of m_src1 and
// m_src2 and writes the 16 floats of the product m_src1 * m_src2 to m_des.
extern void Mat4x4MulF32_(float *m_des, const float* m_src1, const float *m_src2);

// Multiplies m_src1 by m_src2 twice -- once with MatrixF32::Mul4x4 and once
// with the NEON routine Mat4x4MulF32_ -- then prints both operands and both
// products to cout and reports a failure if the two products differ.
void Mat4x4MulF32(MatrixF32& m_src1, MatrixF32& m_src2)
{
    const size_t rows = m_src1.GetNumRows();
    const size_t cols = m_src2.GetNumCols();

    // prod_cpp is filled by MatrixF32::Mul4x4, prod_neon by Mat4x4MulF32_.
    MatrixF32 prod_cpp(rows, cols);
    MatrixF32 prod_neon(rows, cols);

    MatrixF32::Mul4x4(prod_cpp, m_src1, m_src2);
    Mat4x4MulF32_(prod_neon.Data(), m_src1.Data(), m_src2.Data());

    cout << fixed << setprecision(1);

    // Give every matrix that is printed below the same SetOstream settings.
    MatrixF32* const printed[] = { &m_src1, &m_src2, &prod_cpp, &prod_neon };
    for (MatrixF32* m : printed)
        m->SetOstream(12, "  ");

    // Emits a heading line, the matrix itself, and a trailing newline.
    auto show = [](const char* heading, MatrixF32& m) {
        cout << heading << m << '\n';
    };

    cout << "\nResults for Mat4x4MulF32\n";
    show("Matrix m_src1\n", m_src1);
    show("Matrix m_src2\n", m_src2);
    show("Matrix m_des1\n", prod_cpp);
    show("Matrix m_des2\n", prod_neon);

    if (prod_cpp != prod_neon) {
        cout << "\nMatrix compare failed - Mat4x4MulF32\n";
    }
}

void Mat4x4MulF32Test(void) {
    const size_t nr = 4;
    const size_t nc = 4;
    MatrixF32 m_src1(nr ,nc);
    MatrixF32 m_src2(nr ,nc);
    const float src1_row0[] = { 10, 11, 12, 13 };
    const float src1_row1[] = { 20, 21, 22, 23 };
    const float src1_row2[] = { 30, 31, 32, 33 };
    const float src1_row3[] = { 40, 41, 42, 43 };
    const float src2_row0[] = { 100, 101, 102, 103 };
    const float src2_row1[] = { 200, 201, 202, 203 };
    const float src2_row2[] = { 300, 301, 302, 303 };
    const float src2_row3[] = { 400, 401, 402, 403 };
    m_src1.SetRow(0, src1_row0);
    m_src1.SetRow(1, src1_row1);
    m_src1.SetRow(2, src1_row2);
    m_src1.SetRow(3, src1_row3);
    m_src2.SetRow(0, src2_row0);
    m_src2.SetRow(1, src2_row1);
    m_src2.SetRow(2, src2_row2);
    m_src2.SetRow(3, src2_row3);
    Mat4x4MulF32(m_src1, m_src2);
}

// Program entry point: runs the 4x4 matrix multiplication demo once and
// exits with status 0.
int main()
{
    Mat4x4MulF32Test();

    return 0;
}
ch15_06/neon.cpp
#include "Vec128.h"

void Mat4x4MulF32_(float *m_des, const float* m_src1, const float *m_src2) {
  __asm volatile("\n\
	ld1	{v0.4s-v3.4s}, [x1]           // m_src1                                 \n\
	ld1	{v4.4s-v7.4s}, [x2]           // m_src2                                 \n\
	                                                                                \n\
	// Row 0                                                                        \n\
	fmul	v16.4s, v4.4s, v0.s[0]       // v16 = v4 * v0.lane0                     \n\
	fmla	v16.4s, v5.4s, v0.s[1]       // v16 += v5 * v0.lane1                    \n\
	fmla	v16.4s, v6.4s, v0.s[2]       // v16 += v6 * v0.lane2                    \n\
	fmla	v16.4s, v7.4s, v0.s[3]       // v16 += v6 * v0.lane3                    \n\
	st1	{v16.4s}, [x0], 16                                                      \n\
	                                                                                \n\
	// Row 1                                                                        \n\
	fmul	v17.4s, v4.4s, v1.s[0]       // v17 = v4 * v1.lane0                     \n\
	fmla	v17.4s, v5.4s, v1.s[1]       // v17 += v5 * v1.lane1                    \n\
	fmla	v17.4s, v6.4s, v1.s[2]       // v17 += v6 * v1.lane2                    \n\
	fmla	v17.4s, v7.4s, v1.s[3]       // v17 += v6 * v1.lane3                    \n\
	st1	{v17.4s}, [x0], 16                                                      \n\
	                                                                                \n\
	// Row 2                                                                        \n\
	fmul	v18.4s, v4.4s, v2.s[0]       // v18 = v4 * v2.lane0                     \n\
	fmla	v18.4s, v5.4s, v2.s[1]       // v18 += v5 * v2.lane1                    \n\
	fmla	v18.4s, v6.4s, v2.s[2]       // v18 += v6 * v2.lane2                    \n\
	fmla	v18.4s, v7.4s, v2.s[3]       // v18 += v6 * v2.lane3                    \n\
	st1	{v18.4s}, [x0], 16                                                      \n\
	                                                                                \n\
	// Row 3                                                                        \n\
	fmul	v19.4s, v4.4s, v3.s[0]       // v19 = v4 * v3.lane0                     \n\
	fmla	v19.4s, v5.4s, v3.s[1]       // v19 += v5 * v3.lane1                    \n\
	fmla	v19.4s, v6.4s, v3.s[2]       // v19 += v6 * v3.lane2                    \n\
	fmla	v19.4s, v7.4s, v3.s[3]       // v19 += v6 * v3.lane3                    \n\
	st1	{v19.4s}, [x0], 16                                                      \n\
"
		 :
		 :
		 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "x0"
		 );
}
ch15_06/main.cpp の実行例
arm64@manet Ch15_06 % g++ -I.. -std=c++11 -O -S neon.cpp
arm64@manet Ch15_06 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet Ch15_06 % ./a.out

Results for Mat4x4MulF32
Matrix m_src1
        10.0          11.0          12.0          13.0
        20.0          21.0          22.0          23.0
        30.0          31.0          32.0          33.0
        40.0          41.0          42.0          43.0

Matrix m_src2
       100.0         101.0         102.0         103.0
       200.0         201.0         202.0         203.0
       300.0         301.0         302.0         303.0
       400.0         401.0         402.0         403.0

Matrix m_des1
     12000.0       12046.0       12092.0       12138.0
     22000.0       22086.0       22172.0       22258.0
     32000.0       32126.0       32252.0       32378.0
     42000.0       42166.0       42332.0       42498.0

Matrix m_des2
     12000.0       12046.0       12092.0       12138.0
     22000.0       22086.0       22172.0       22258.0
     32000.0       32126.0       32252.0       32378.0
     42000.0       42166.0       42332.0       42498.0

arm64@manet Ch15_06 % 


http://nw.tsuda.ac.jp/