Modern Arm Assembly Language Programming: Covers Armv8-A 32-bit, 64-bit, and SIMD

by Daniel Kusswurm
2021.07.28: updated by

Chapter 15: Armv8-64 SIMD Floating-Point Programming

Packed Floating-Point Matrices

Matrix Trace and Transposition

NEON を使って 4x4 行列の転置行列を求めるプログラムの説明


#include <iostream>
#include <iomanip>
#include "MatrixF32.h"

using namespace std;

// NEON 4x4 single-precision transpose: reads 16 floats from m_src1 and writes
// the transposed 16 floats to m_des. Declared here and defined in another
// translation unit.
extern void Mat4x4TransposeF32_(float* m_des, const float* m_src1);

// Transposes m_src1 two ways -- with MatrixF32::Transpose and with the NEON
// routine Mat4x4TransposeF32_ -- prints the source and both results to cout,
// and reports a failure message when the two results differ.
//
// m_src1: source matrix. NOTE(review): Mat4x4TransposeF32_ always transfers
//         exactly 16 floats, so this presumably has to be a 4x4 matrix --
//         confirm against callers.
void Mat4x4TestF32(MatrixF32& m_src1) {
    const char nl = '\n';

    // Destination sizes are taken from the source with rows and columns
    // swapped (presumably the constructor order is (rows, cols) -- confirm).
    const size_t nr = m_src1.GetNumCols();
    const size_t nc = m_src1.GetNumRows();
    MatrixF32 m_des1(nr, nc);
    MatrixF32 m_des2(nr, nc);

    // Reference result from the C++ class, then the NEON result.
    MatrixF32::Transpose(m_des1, m_src1);
    Mat4x4TransposeF32_(m_des2.Data(), m_src1.Data());

    // Output formatting: one decimal place; SetOstream arguments are
    // presumably the field width and the column separator -- TODO confirm.
    cout << fixed << setprecision(1);
    m_src1.SetOstream(12, "  ");
    m_des1.SetOstream(12, "  ");
    m_des2.SetOstream(12, "  ");

    cout << "\nResults for Mat4x4TestF32\n";
    cout << "Matrix m_src1\n" << m_src1 << nl;
    cout << "Matrix m_des1 (transpose of m_src1)\n" << m_des1 << nl;
    cout << "Matrix m_des2 (transpose of m_src1)\n" << m_des2 << nl;

    if (m_des1 != m_des2)
        cout << "\nMatrix transpose compare failed\n";
}

// Builds the 4x4 test matrix with rows {10..13}, {20..23}, {30..33},
// {40..43} and runs the transpose comparison on it.
void Mat4x4TestF32(void) {
    const size_t nr = 4;
    const size_t nc = 4;
    MatrixF32 m_src1(nr, nc);

    const float src1_row0[] = { 10, 11, 12, 13 };
    const float src1_row1[] = { 20, 21, 22, 23 };
    const float src1_row2[] = { 30, 31, 32, 33 };
    const float src1_row3[] = { 40, 41, 42, 43 };
    m_src1.SetRow(0, src1_row0);
    m_src1.SetRow(1, src1_row1);
    m_src1.SetRow(2, src1_row2);
    m_src1.SetRow(3, src1_row3);

    // Hand the populated matrix to the comparison routine; without this call
    // the matrix would be built and then discarded without any output.
    Mat4x4TestF32(m_src1);
}

int main() {
  return 0;
#include "Vec128.h"

// Transposes a 4x4 matrix of single-precision floats.
//
// m_des:  receives 16 contiguous floats, the transpose of the source.
// m_src1: 16 contiguous floats, treated as four rows (a, b, c, d) of four
//         elements each.
//
// All 16 source elements are read before any destination element is written.
//
// On AArch64 the work is done with NEON: two rounds of TRN1/TRN2, first on
// 32-bit lanes and then on 64-bit lanes. The pointers are passed as asm
// operands rather than assumed to sit in x0/x1, and "memory" is listed as a
// clobber because the asm reads and writes memory the compiler cannot see.
// Other targets use an equivalent scalar loop.
void Mat4x4TransposeF32_(float* m_des, const float* m_src1) {
#if defined(__aarch64__)
  __asm volatile (
    "ld1  {v0.4s-v3.4s}, [%[src]]    \n\t"  // v0..v3 = rows a, b, c, d
    "trn1 v4.4s, v0.4s, v1.4s        \n\t"  // a0 b0 a2 b2
    "trn2 v5.4s, v0.4s, v1.4s        \n\t"  // a1 b1 a3 b3
    "trn1 v6.4s, v2.4s, v3.4s        \n\t"  // c0 d0 c2 d2
    "trn2 v7.4s, v2.4s, v3.4s        \n\t"  // c1 d1 c3 d3
    "trn1 v0.2d, v4.2d, v6.2d        \n\t"  // a0 b0 c0 d0
    "trn1 v1.2d, v5.2d, v7.2d        \n\t"  // a1 b1 c1 d1
    "trn2 v2.2d, v4.2d, v6.2d        \n\t"  // a2 b2 c2 d2
    "trn2 v3.2d, v5.2d, v7.2d        \n\t"  // a3 b3 c3 d3
    "st1  {v0.4s-v3.4s}, [%[des]]    \n\t"
    : /* no register outputs; results are stored through m_des */
    : [des] "r" (m_des), [src] "r" (m_src1)
    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory"
  );
#else
  // Scalar fallback. Copy the source first so that every element is read
  // before any element is written, as in the NEON path.
  float tmp[16];
  for (int i = 0; i < 16; ++i)
    tmp[i] = m_src1[i];
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c)
      m_des[c * 4 + r] = tmp[r * 4 + c];
#endif
}
Ch15_07/main.cpp の実行例
arm64@manet Ch15_07 % g++ -I.. -std=c++11 -O -S neon.cpp
arm64@manet Ch15_07 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet Ch15_07 % ./a.out

Results for Mat4x4TestF32
Matrix m_src1
        10.0          11.0          12.0          13.0
        20.0          21.0          22.0          23.0
        30.0          31.0          32.0          33.0
        40.0          41.0          42.0          43.0

Matrix m_des1 (transpose of m_src1)
        10.0          20.0          30.0          40.0
        11.0          21.0          31.0          41.0
        12.0          22.0          32.0          42.0
        13.0          23.0          33.0          43.0

Matrix m_des2 (transpose of m_src1)
        10.0          20.0          30.0          40.0
        11.0          21.0          31.0          41.0
        12.0          22.0          32.0          42.0
        13.0          23.0          33.0          43.0

arm64@manet Ch15_07 %