Modern Arm Assembly Language Programming: Covers Armv8-A 32-bit, 64-bit, and SIMD

by Daniel Kusswurm
2021.07.28: updated by
Up

Chapter 15: Armv8-64 SIMD Floating-Point Programming

Packed Floating-Point Arithmetic

Conversions

NEON を使って型変換を行うプログラムの説明。

ch15_03/main.cpp
#include <iostream>
#include <iomanip>
#include <cmath>
#include "Vec128.h"
using namespace std;

// Packed NEON conversion routines, defined in neon.cpp with inline assembly.
// Naming is <destination type>from<source type>. Each routine converts the
// 128-bit vector `a` and stores the result in x[0]; F64fromF32 widens four
// floats into four doubles and therefore fills both x[0] and x[1].
// F32fromF64 narrows two vectors of doubles (`a` -> low lanes, `b` -> high
// lanes) into a single vector of floats.
extern void F32fromI32(Vec128 x[2], const Vec128& a);
extern void I32fromF32(Vec128 x[2], const Vec128& a);
extern void F64fromI64(Vec128 x[2], const Vec128& a);
extern void I64fromF64(Vec128 x[2], const Vec128& a);
extern void F32fromU32(Vec128 x[2], const Vec128& a);
extern void U32fromF32(Vec128 x[2], const Vec128& a);
extern void F64fromU64(Vec128 x[2], const Vec128& a);
extern void U64fromF64(Vec128 x[2], const Vec128& a);
extern void F32fromF64(Vec128 x[2], const Vec128& a, const Vec128& b);
extern void F64fromF32(Vec128 x[2], const Vec128& a);

void PackedConvertA(void) {
    const char nl = '\n';
    Vec128 x[2], a;

    // F32_I32
    a.m_I32[0] = 10;
    a.m_I32[1] = -500;
    a.m_I32[2] = 600;
    a.m_I32[3] = -1024;
    F32fromI32(x, a);
    cout << "\nResults for CvtOp::F32_I32\n";
    cout << "a:    " << a.ToStringI32() << nl;
    cout << "x[0]: " << x[0].ToStringF32() << nl;

    // I32_F32
    a.m_F32[0] = -1.25f;
    a.m_F32[1] = 100.875f;
    a.m_F32[2] = -200.0f;
    a.m_F32[3] = (float)M_PI;
    I32fromF32(x, a);
    cout << "\nResults for CvtOp::I32_F32\n";
    cout << "a:    " << a.ToStringF32() << nl;
    cout << "x[0]: " << x[0].ToStringI32() << nl;

    // F64_I64
    a.m_I64[0] = 1000;
    a.m_I64[1] = -500000000000;
    F64fromI64(x, a);
    cout << "\nResults for CvtOp::F64_I64\n";
    cout << "a:    " << a.ToStringI64() << nl;
    cout << "x[0]: " << x[0].ToStringF64() << nl;

    // I64_F64
    a.m_F64[0] = -122.66666667;
    a.m_F64[1] = 1234567890123.75;
    I64fromF64(x, a);
    cout << "\nResults for CvtOp::I64_F64\n";
    cout << "a:    " << a.ToStringF64() << nl;
    cout << "x[0]: " << x[0].ToStringI64() << nl;
}

// Runs the unsigned packed conversions (uint32 <-> float32,
// uint64 <-> float64) on fixed sample values and prints each source vector
// next to its result.
void PackedConvertB(void)
{
    const char nl = '\n';
    Vec128 x[2], a;

    // F32_U32: unsigned 32-bit integers -> single precision.
    a.m_U32[0] = 10;
    a.m_U32[1] = 500;
    a.m_U32[2] = 600;
    a.m_U32[3] = 1024;
    F32fromU32(x, a);
    cout << "\nResults for CvtOp::F32_U32\n";
    cout << "a:    " << a.ToStringU32() << nl;
    cout << "x[0]: " << x[0].ToStringF32() << nl;

    // U32_F32: single precision -> unsigned 32-bit integers.
    a.m_F32[0] = 1.25f;
    a.m_F32[1] = 100.875f;
    a.m_F32[2] = 200.0f;
    a.m_F32[3] = static_cast<float>(M_PI);
    U32fromF32(x, a);
    cout << "\nResults for CvtOp::U32_F32\n";
    cout << "a:    " << a.ToStringF32() << nl;
    cout << "x[0]: " << x[0].ToStringU32() << nl;

    // F64_U64: unsigned 64-bit integers -> double precision.
    // Fill the unsigned view so the input matches the unsigned conversion
    // and the ToStringU64() printout (previously written through m_I64).
    // NOTE(review): assumes Vec128 exposes m_U64 alongside m_U32 - confirm
    // against Vec128.h.
    a.m_U64[0] = 1000;
    a.m_U64[1] = 420000000000;
    F64fromU64(x, a);
    cout << "\nResults for CvtOp::F64_U64\n";
    cout << "a:    " << a.ToStringU64() << nl;
    cout << "x[0]: " << x[0].ToStringF64() << nl;

    // U64_F64: double precision -> unsigned 64-bit integers.
    a.m_F64[0] = 698.40;
    a.m_F64[1] = 1234567890123.75;
    U64fromF64(x, a);
    cout << "\nResults for CvtOp::U64_F64\n";
    cout << "a:    " << a.ToStringF64() << nl;
    cout << "x[0]: " << x[0].ToStringU64() << nl;
}

void PackedConvertC(void)
{
    const char nl = '\n';
    Vec128 x[2], a, b;

    // F32_F64
    a.m_F64[0] = M_PI;
    a.m_F64[1] = M_LOG10E;
    b.m_F64[0] = -M_E;
    b.m_F64[1] = M_LN2;
    F32fromF64(x, a, b);
    cout << "\nResults for CvtOp::F32_F64\n";
    cout << "a:    " << a.ToStringF64() << nl;
    cout << "b:    " << b.ToStringF64() << nl;
    cout << "x[0]: " << x[0].ToStringF32() << nl;

    // F64_F32
    a.m_F32[0] = 1.0f / 9.0f;
    a.m_F32[1] = 100.875f;
    a.m_F32[2] = 200.0f;
    a.m_F32[3] = (float)M_SQRT2;
    F64fromF32(x, a);
    cout << "\nResults for CvtOp::F64_F32\n";
    cout << "a:    " << a.ToStringF32() << nl;
    cout << "x[0]: " << x[0].ToStringF64() << nl;
    cout << "x[1]: " << x[1].ToStringF64() << nl;
}

// Entry point: runs the signed, unsigned and float-width conversion demos
// in that order.
int main()
{
    PackedConvertA();   // signed integer <-> floating point
    PackedConvertB();   // unsigned integer <-> floating point
    PackedConvertC();   // float <-> double
}
ch15_03/neon.cpp
#include "Vec128.h"

// Converts the four signed 32-bit integers in `a` to single precision and
// stores the packed result in x[0].
//
// The pointers are passed to the asm as named input operands rather than
// assumed to still be in x0/x1, so the routine stays correct if the compiler
// inlines it or moves the arguments. The "memory" clobber tells the compiler
// that the asm reads and writes memory through those pointers.
void F32fromI32(Vec128 x[2], const Vec128& a) {
  __asm volatile (
	"ld1    {v0.4s}, [%[src]]    \n\t"   // v0 = a (4 x int32)
	"scvtf  v1.4s, v0.4s         \n\t"   // float32 <- int32
	"st1    {v1.4s}, [%[dst]]    \n\t"   // x[0] = v1
	:
	: [dst] "r" (x), [src] "r" (&a)
	: "v0", "v1", "memory"
  );
}

// Converts the four single-precision values in `a` to signed 32-bit
// integers (FCVTNS: round to nearest, ties to even) and stores the packed
// result in x[0].
//
// The pointers are passed to the asm as named input operands rather than
// assumed to still be in x0/x1, and "memory" is clobbered because the asm
// loads and stores through them.
void I32fromF32(Vec128 x[2], const Vec128& a) {
  __asm volatile (
	"ld1    {v0.4s}, [%[src]]    \n\t"   // v0 = a (4 x float32)
	"fcvtns v1.4s, v0.4s         \n\t"   // int32 <- float32
	"st1    {v1.4s}, [%[dst]]    \n\t"   // x[0] = v1
	:
	: [dst] "r" (x), [src] "r" (&a)
	: "v0", "v1", "memory"
  );
}

// Converts the two signed 64-bit integers in `a` to double precision and
// stores the packed result in x[0].
//
// The pointers are passed to the asm as named input operands rather than
// assumed to still be in x0/x1, and "memory" is clobbered because the asm
// loads and stores through them.
void F64fromI64(Vec128 x[2], const Vec128& a) {
  __asm volatile (
	"ld1    {v0.2d}, [%[src]]    \n\t"   // v0 = a (2 x int64)
	"scvtf  v1.2d, v0.2d         \n\t"   // float64 <- int64
	"st1    {v1.2d}, [%[dst]]    \n\t"   // x[0] = v1
	:
	: [dst] "r" (x), [src] "r" (&a)
	: "v0", "v1", "memory"
  );
}

// Converts the two double-precision values in `a` to signed 64-bit integers
// (FCVTNS: round to nearest, ties to even) and stores the packed result in
// x[0].
//
// The pointers are passed to the asm as named input operands rather than
// assumed to still be in x0/x1, and "memory" is clobbered because the asm
// loads and stores through them.
void I64fromF64(Vec128 x[2], const Vec128& a) {
  __asm volatile (
	"ld1    {v0.2d}, [%[src]]    \n\t"   // v0 = a (2 x float64)
	"fcvtns v1.2d, v0.2d         \n\t"   // int64 <- float64
	"st1    {v1.2d}, [%[dst]]    \n\t"   // x[0] = v1
	:
	: [dst] "r" (x), [src] "r" (&a)
	: "v0", "v1", "memory"
  );
}


// Converts the four unsigned 32-bit integers in `a` to single precision and
// stores the packed result in x[0].
//
// The pointers are passed to the asm as named input operands rather than
// assumed to still be in x0/x1, and "memory" is clobbered because the asm
// loads and stores through them.
void F32fromU32(Vec128 x[2], const Vec128& a) {
  __asm volatile (
	"ld1    {v0.4s}, [%[src]]    \n\t"   // v0 = a (4 x uint32)
	"ucvtf  v1.4s, v0.4s         \n\t"   // float32 <- uint32
	"st1    {v1.4s}, [%[dst]]    \n\t"   // x[0] = v1
	:
	: [dst] "r" (x), [src] "r" (&a)
	: "v0", "v1", "memory"
  );
}

// Converts the four single-precision values in `a` to unsigned 32-bit
// integers (FCVTNU: round to nearest, ties to even) and stores the packed
// result in x[0].
//
// The pointers are passed to the asm as named input operands rather than
// assumed to still be in x0/x1, and "memory" is clobbered because the asm
// loads and stores through them.
void U32fromF32(Vec128 x[2], const Vec128& a) {
  __asm volatile (
	"ld1    {v0.4s}, [%[src]]    \n\t"   // v0 = a (4 x float32)
	"fcvtnu v1.4s, v0.4s         \n\t"   // uint32 <- float32
	"st1    {v1.4s}, [%[dst]]    \n\t"   // x[0] = v1
	:
	: [dst] "r" (x), [src] "r" (&a)
	: "v0", "v1", "memory"
  );
}

// Converts the two unsigned 64-bit integers in `a` to double precision and
// stores the packed result in x[0].
//
// The pointers are passed to the asm as named input operands rather than
// assumed to still be in x0/x1, and "memory" is clobbered because the asm
// loads and stores through them.
void F64fromU64(Vec128 x[2], const Vec128& a) {
  __asm volatile (
	"ld1    {v0.2d}, [%[src]]    \n\t"   // v0 = a (2 x uint64)
	"ucvtf  v1.2d, v0.2d         \n\t"   // float64 <- uint64
	"st1    {v1.2d}, [%[dst]]    \n\t"   // x[0] = v1
	:
	: [dst] "r" (x), [src] "r" (&a)
	: "v0", "v1", "memory"
  );
}

// Converts the two double-precision values in `a` to unsigned 64-bit
// integers (FCVTNU: round to nearest, ties to even) and stores the packed
// result in x[0].
//
// The pointers are passed to the asm as named input operands rather than
// assumed to still be in x0/x1, and "memory" is clobbered because the asm
// loads and stores through them.
void U64fromF64(Vec128 x[2], const Vec128& a) {
  __asm volatile (
	"ld1    {v0.2d}, [%[src]]    \n\t"   // v0 = a (2 x float64)
	"fcvtnu v1.2d, v0.2d         \n\t"   // uint64 <- float64
	"st1    {v1.2d}, [%[dst]]    \n\t"   // x[0] = v1
	:
	: [dst] "r" (x), [src] "r" (&a)
	: "v0", "v1", "memory"
  );
}


// Narrows four double-precision values to single precision and stores them
// in x[0]: the two doubles of `a` become float lanes 0-1 and the two doubles
// of `b` become float lanes 2-3.
//
// All three pointers are passed to the asm as named input operands rather
// than assumed to still be in x0/x1/x2 (the original read x2 without listing
// it anywhere). "memory" is clobbered because the asm loads and stores
// through them.
void F32fromF64(Vec128 x[2], const Vec128& a, const Vec128& b) {
  __asm volatile (
	"ld1    {v0.2d}, [%[lo]]     \n\t"   // v0 = a (2 x float64)
	"ld1    {v2.2d}, [%[hi]]     \n\t"   // v2 = b (2 x float64)
	"fcvtn  v1.2s, v0.2d         \n\t"   // lower two float32 lanes <- a
	"fcvtn2 v1.4s, v2.2d         \n\t"   // upper two float32 lanes <- b
	"st1    {v1.4s}, [%[dst]]    \n\t"   // x[0] = v1
	:
	: [dst] "r" (x), [lo] "r" (&a), [hi] "r" (&b)
	: "v0", "v1", "v2", "memory"
  );
}

// Widens the four single-precision values in `a` to double precision.
// Float lanes 0-1 go to x[0] and float lanes 2-3 go to x[1]; both are
// written by one two-register ST1 to consecutive memory starting at x.
//
// The pointers are passed to the asm as named input operands rather than
// assumed to still be in x0/x1, and "memory" is clobbered because the asm
// loads and stores through them.
void F64fromF32(Vec128 x[2], const Vec128& a) {
  __asm volatile (
	"ld1    {v0.4s}, [%[src]]          \n\t"   // v0 = a (4 x float32)
	"fcvtl  v1.2d, v0.2s               \n\t"   // float64 <- lower two float32
	"fcvtl2 v2.2d, v0.4s               \n\t"   // float64 <- upper two float32
	"st1    {v1.2d, v2.2d}, [%[dst]]   \n\t"   // x[0] = v1, x[1] = v2
	:
	: [dst] "r" (x), [src] "r" (&a)
	: "v0", "v1", "v2", "memory"
  );
}
ch15_03/main.cpp の実行例
arm64@manet ch15_03 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet ch15_03 % ./a.out

Results for CvtOp::F32_I32
a:                  10            -500  |             600           -1024
x[0]:        10.000000     -500.000000  |      600.000000    -1024.000000

Results for CvtOp::I32_F32
a:           -1.250000      100.875000  |     -200.000000        3.141593
x[0]:               -1             101  |            -200               3

Results for CvtOp::F64_I64
a:                                1000  |                   -500000000000
x[0]:                1000.000000000000  |      -500000000000.000000000000

Results for CvtOp::I64_F64
a:                   -122.666666670000  |      1234567890123.750000000000
x[0]:                             -123  |                   1234567890124

Results for CvtOp::F32_U32
a:                  10             500  |             600            1024
x[0]:        10.000000      500.000000  |      600.000000     1024.000000

Results for CvtOp::U32_F32
a:            1.250000      100.875000  |      200.000000        3.141593
x[0]:                1             101  |             200               3

Results for CvtOp::F64_U64
a:                                1000  |                    420000000000
x[0]:                1000.000000000000  |       420000000000.000000000000

Results for CvtOp::U64_F64
a:                    698.400000000000  |      1234567890123.750000000000
x[0]:                              698  |                   1234567890124

Results for CvtOp::F32_F64
a:                      3.141592653590  |                  0.434294481903
b:                     -2.718281828459  |                  0.693147180560
x[0]:         3.141593        0.434294  |       -2.718282        0.693147

Results for CvtOp::F64_F32
a:            0.111111      100.875000  |      200.000000        1.414214
x[0]:                   0.111111111939  |                100.875000000000
x[1]:                 200.000000000000  |                  1.414213538170



http://nw.tsuda.ac.jp/