From 3f7f4c0a52894280438f095d8d2e564d2d67dfee Mon Sep 17 00:00:00 2001 From: Francisco Javier Trujillo Mata Date: Wed, 5 Nov 2025 11:29:55 +0100 Subject: [PATCH] Use VU0_VECTORs and ASM code cmake: enable VU0/ASM options and wire test execution docs(vu0): add emulation caveats and tests README note (fobes.dev) docs(vu0): add copilot-instructions and consolidate VU0 sources tests: add VU mul emulator-detection helper and global flag tmp --- .github/workflows/compilation.yml | 5 +- .gitignore | 2 +- CMAKE_BUILD.md | 20 +- CMakeLists.txt | 18 +- include/ps2s/cpu_compat_types.h | 865 ++++++++++ include/ps2s/cpu_matrix.h | 65 + include/ps2s/cpu_vector.h | 86 +- include/ps2s/matrix.h | 1340 ++++++++------- include/ps2s/matrix_common.h | 870 ++++++---- include/ps2s/vector.h | 273 ++-- include/ps2s/vector_common.h | 733 +++++---- include/ps2s/vu.h | 168 +- tests/CMakeLists.txt | 55 +- tests/README.md | 85 + tests/test_shared.cpp | 2546 +++++++++++++++++++++++++++++ 15 files changed, 5663 insertions(+), 1468 deletions(-) create mode 100644 include/ps2s/cpu_compat_types.h create mode 100644 tests/README.md create mode 100644 tests/test_shared.cpp diff --git a/.github/workflows/compilation.yml b/.github/workflows/compilation.yml index ec5b372..a78b2b8 100644 --- a/.github/workflows/compilation.yml +++ b/.github/workflows/compilation.yml @@ -19,11 +19,11 @@ jobs: apk update apk add cmake build-base git make - - name: Configure with CMake + - name: Configure with CMake (library + tests) run: | mkdir build cd build - cmake .. + cmake -DBUILD_TESTS=ON .. - name: Build project with CMake run: | @@ -46,3 +46,4 @@ jobs: name: libps2stuff-${{ steps.slug.outputs.sha8 }} path: | build/*.a + build/tests/*.elf diff --git a/.gitignore b/.gitignore index fdcc877..76f6fc6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ objs_* prebuilddone *.o *.a - # CMake build/ CMakeCache.txt @@ -12,3 +11,4 @@ cmake_install.cmake install_manifest.txt *.cmake !CMakeLists.txt +*.elf diff --git a/CMAKE_BUILD.md b/CMAKE_BUILD.md index 968bb4a..cba521c 100644 --- a/CMAKE_BUILD.md +++ b/CMAKE_BUILD.md @@ -30,13 +30,19 @@ make ### Building with Tests -To build the test executables (when available): +To build the test executables: ```bash cmake -DBUILD_TESTS=ON .. make ``` +If `ps2client` is installed and available in `PATH`, you can run: + +```bash +make run_test_vu0 +``` + ## Installing To install the library and headers to `$PS2SDK/ports`: @@ -57,16 +63,22 @@ The following CMake options are available: |--------|---------|-------------| | `DEBUG` | OFF | Enable debug build with `_DEBUG` definition | | `BUILD_TESTS` | OFF | Build test executables | +| `ENABLE_VU0_VECTORS` | ON | Enable VU0 vector code paths | +| `ENABLE_ASM` | ON | Enable assembly optimizations | ## Build Flags -The CMake build automatically applies the following flags: +The CMake build automatically applies the following warning flags: -- `-DNO_VU0_VECTORS` - Disables VU0 vector code (currently broken) -- `-DNO_ASM` - Disables assembly optimizations - `-Wno-strict-aliasing` - Suppresses strict aliasing warnings - `-Wno-conversion-null` - Suppresses conversion null warnings +To disable VU0 vectors or assembly paths, configure with: + +```bash +cmake -DENABLE_VU0_VECTORS=OFF -DENABLE_ASM=OFF .. +``` + ## CMake Toolchain The build uses the PS2DEV CMake toolchain file located at: diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f8b1c3..4fbbd84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,8 @@ project(ps2stuff VERSION 1.0.0 LANGUAGES CXX C) # Options option(BUILD_TESTS "Build test projects" OFF) option(DEBUG "Enable debug build" OFF) +option(ENABLE_VU0_VECTORS "Enable VU0 vector code paths" ON) +option(ENABLE_ASM "Enable assembly optimizations" ON) # Check if PS2SDK is set (should be done by toolchain file) if(NOT DEFINED PS2SDK) @@ -45,11 +47,13 @@ set(WARNING_FLAGS -Wno-conversion-null ) -# VU0 code is broken so disable for now -add_compile_definitions( - NO_VU0_VECTORS - NO_ASM -) +if(NOT ENABLE_VU0_VECTORS) + add_compile_definitions(NO_VU0_VECTORS) +endif() + +if(NOT ENABLE_ASM) + add_compile_definitions(NO_ASM) +endif() add_compile_options(${WARNING_FLAGS}) @@ -122,6 +126,6 @@ message(STATUS " Debug build: ${DEBUG}") message(STATUS " Build tests: ${BUILD_TESTS}") message(STATUS " Output library: ${EE_LIB}") message(STATUS " Install prefix: ${PS2SDK}/ports") -message(STATUS " VU0 vectors: DISABLED") -message(STATUS " ASM optimizations: DISABLED") +message(STATUS " VU0 vectors: ${ENABLE_VU0_VECTORS}") +message(STATUS " ASM optimizations: ${ENABLE_ASM}") message(STATUS "") diff --git a/include/ps2s/cpu_compat_types.h b/include/ps2s/cpu_compat_types.h new file mode 100644 index 0000000..9b7572e --- /dev/null +++ b/include/ps2s/cpu_compat_types.h @@ -0,0 +1,865 @@ +/* + * cpu_compat_types.h - CPU-only implementations of VU0 vector/matrix types. + * + * Provides the same class names and API as vector.h/matrix.h but uses + * standard C++ float arithmetic. Compile with -DUSE_CPU_COMPAT to select + * this path; the 12 VU0-specific tests (accumulator + DMA copy) are then + * excluded from the test binary at compile time. + */ + +#pragma once + +#include + +/* ======================================================================== + * Forward declarations + * ======================================================================== */ + +struct vec_xyz; +struct vec_xyzw; +struct mat_33; +struct mat_34; +struct mat_43; +struct mat_44; +struct transform_t; + +/* ======================================================================== + * Single-component wrappers (vec_x, vec_y, vec_z, vec_w) + * + * Used by ASSERT_VEC_EQ: (float)vec_x(v) extracts the x component and + * converts to float. + * ======================================================================== */ + +struct vec_x { + float v; + explicit vec_x(float f) : v(f) {} + vec_x(const vec_xyz& t); + vec_x(const vec_xyzw& t); + operator float() const { return v; } +}; + +struct vec_y { + float v; + explicit vec_y(float f) : v(f) {} + vec_y(const vec_xyz& t); + vec_y(const vec_xyzw& t); + operator float() const { return v; } +}; + +struct vec_z { + float v; + explicit vec_z(float f) : v(f) {} + vec_z(const vec_xyz& t); + vec_z(const vec_xyzw& t); + operator float() const { return v; } +}; + +struct vec_w { + float v; + explicit vec_w(float f) : v(f) {} + vec_w(const vec_xyz& t); + vec_w(const vec_xyzw& t); + operator float() const { return v; } +}; + +/* ======================================================================== + * vec_xyz (3D float vector) + * ======================================================================== */ + +struct vec_xyz { + float x, y, z; + + vec_xyz() : x(0.0f), y(0.0f), z(0.0f) {} + vec_xyz(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {} + // Conversion from vec_xyzw (drops w) + vec_xyz(const vec_xyzw& v); + + void set(float s) { x = s; y = s; z = s; } + void set(const vec_xyz& v) { x = v.x; y = v.y; z = v.z; } + void set(float x_, float y_, float z_) { x = x_; y = y_; z = z_; } + void set_zero() { x = 0.0f; y = 0.0f; z = 0.0f; } + + vec_xyz operator+(const vec_xyz& r) const { return {x+r.x, y+r.y, z+r.z}; } + vec_xyz operator-(const vec_xyz& r) const { return {x-r.x, y-r.y, z-r.z}; } + vec_xyz operator*(const vec_xyz& r) const { return {x*r.x, y*r.y, z*r.z}; } + vec_xyz operator/(const vec_xyz& r) const { return {x/r.x, y/r.y, z/r.z}; } + vec_xyz operator*(float s) const { return {x*s, y*s, z*s}; } + vec_xyz operator/(float s) const { float r=1.0f/s; return {x*r, y*r, z*r}; } + vec_xyz operator-() const { return {-x, -y, -z}; } + + vec_xyz& operator+=(const vec_xyz& r) { x+=r.x; y+=r.y; z+=r.z; return *this; } + vec_xyz& operator-=(const vec_xyz& r) { x-=r.x; y-=r.y; z-=r.z; return *this; } + vec_xyz& operator*=(const vec_xyz& r) { x*=r.x; y*=r.y; z*=r.z; return *this; } + + vec_xyz abs() const { return {fabsf(x), fabsf(y), fabsf(z)}; } + vec_xyz max(const vec_xyz& r) const { + return {x > r.x ? x : r.x, + y > r.y ? y : r.y, + z > r.z ? z : r.z}; + } + vec_xyz min(const vec_xyz& r) const { + return {x < r.x ? x : r.x, + y < r.y ? y : r.y, + z < r.z ? z : r.z}; + } + vec_xyz sign() const { + return {x > 0.0f ? 1.0f : (x < 0.0f ? -1.0f : 0.0f), + y > 0.0f ? 1.0f : (y < 0.0f ? -1.0f : 0.0f), + z > 0.0f ? 1.0f : (z < 0.0f ? -1.0f : 0.0f)}; + } + vec_xyz one_over() const { return {1.0f/x, 1.0f/y, 1.0f/z}; } + + float dot(const vec_xyz& r) const { return x*r.x + y*r.y + z*r.z; } + float length_sqr() const { return x*x + y*y + z*z; } + float length() const { return sqrtf(length_sqr()); } + + vec_xyz normalized() const { + float len = length(); + if (len == 0.0f) return {0.0f, 0.0f, 0.0f}; + float inv = 1.0f / len; + return {x*inv, y*inv, z*inv}; + } + vec_xyz& normalize() { *this = normalized(); return *this; } + + vec_xyz cross(const vec_xyz& r) const { + return {y*r.z - z*r.y, + z*r.x - x*r.z, + x*r.y - y*r.x}; + } + + int is_zero() const { return (x == 0.0f && y == 0.0f && z == 0.0f) ? 1 : 0; } + + vec_xyz set_length(float len) const { return normalized() * len; } + + vec_xyz truncate_length(float maxLen) const { + float len = length(); + if (len <= maxLen || len == 0.0f) return *this; + float scale = maxLen / len; + return {x*scale, y*scale, z*scale}; + } + + vec_xyz interpolate(float alpha, const vec_xyz& other) const { + return {x + alpha*(other.x - x), + y + alpha*(other.y - y), + z + alpha*(other.z - z)}; + } + + float distance_from(const vec_xyz& r) const { return (*this - r).length(); } + + // Parallel component of *this along unit vector basis + vec_xyz parallel_component(const vec_xyz& unitBasis) const { + return unitBasis * dot(unitBasis); + } + // Perpendicular component of *this to unit vector basis + vec_xyz perpendicular_component(const vec_xyz& unitBasis) const { + return *this - parallel_component(unitBasis); + } + + // Index access + float& operator[](int i) { return (&x)[i]; } + float operator[](int i) const { return (&x)[i]; } +}; + +// Scalar-first multiply +inline vec_xyz operator*(float s, const vec_xyz& v) { return v * s; } + +typedef vec_xyz vec_3; + +/* ======================================================================== + * vec_xyzw (4D float vector) + * ======================================================================== */ + +struct vec_xyzw { + float x, y, z, w; + + vec_xyzw() : x(0.0f), y(0.0f), z(0.0f), w(0.0f) {} + vec_xyzw(float x_, float y_, float z_, float w_) : x(x_), y(y_), z(z_), w(w_) {} + // 3-arg ctor sets w=0 + vec_xyzw(float x_, float y_, float z_) : x(x_), y(y_), z(z_), w(0.0f) {} + // Construct from vec_xyz (w=0) + vec_xyzw(const vec_xyz& v) : x(v.x), y(v.y), z(v.z), w(0.0f) {} + + void set(float x_, float y_, float z_, float w_) { x=x_; y=y_; z=z_; w=w_; } + void set(float x_, float y_, float z_) { x=x_; y=y_; z=z_; } + void set_zero() { x=0.0f; y=0.0f; z=0.0f; w=0.0f; } + + vec_xyzw operator+(const vec_xyzw& r) const { return {x+r.x, y+r.y, z+r.z, w+r.w}; } + vec_xyzw operator-(const vec_xyzw& r) const { return {x-r.x, y-r.y, z-r.z, w-r.w}; } + vec_xyzw operator*(const vec_xyzw& r) const { return {x*r.x, y*r.y, z*r.z, w*r.w}; } + vec_xyzw operator*(float s) const { return {x*s, y*s, z*s, w*s}; } + vec_xyzw operator-() const { return {-x, -y, -z, -w}; } + + vec_xyzw& operator+=(const vec_xyzw& r) { x+=r.x; y+=r.y; z+=r.z; w+=r.w; return *this; } + vec_xyzw& operator-=(const vec_xyzw& r) { x-=r.x; y-=r.y; z-=r.z; w-=r.w; return *this; } + + float dot(const vec_xyzw& r) const { return x*r.x + y*r.y + z*r.z + w*r.w; } + float dot3(const vec_xyz& r) const { return x*r.x + y*r.y + z*r.z; } + + vec_xyzw normalized3() const { + float len = sqrtf(x*x + y*y + z*z); + if (len == 0.0f) return *this; + float inv = 1.0f / len; + return {x*inv, y*inv, z*inv, w}; + } + vec_xyzw& normalize3() { *this = normalized3(); return *this; } + + // Index access + float& operator[](int i) { return (&x)[i]; } + float operator[](int i) const { return (&x)[i]; } +}; + +inline vec_xyzw operator*(float s, const vec_xyzw& v) { return v * s; } + +typedef vec_xyzw vec_4; + +/* ======================================================================== + * vector_t (direction: treat w as 0) + * point_t (position: treat w as 1) + * Both stored as vec_xyz; w is implicit. + * ======================================================================== */ + +struct vector_t : public vec_xyz { + vector_t() : vec_xyz() {} + vector_t(float x_, float y_, float z_) : vec_xyz(x_, y_, z_) {} + vector_t(const vec_xyz& v) : vec_xyz(v) {} +}; + +struct point_t : public vec_xyz { + point_t() : vec_xyz() {} + point_t(float x_, float y_, float z_) : vec_xyz(x_, y_, z_) {} + point_t(const vec_xyz& v) : vec_xyz(v) {} +}; + +/* ======================================================================== + * Deferred single-component conversions (need full vec_xyz/vec_xyzw) + * ======================================================================== */ + +inline vec_x::vec_x(const vec_xyz& t) : v(t.x) {} +inline vec_x::vec_x(const vec_xyzw& t) : v(t.x) {} +inline vec_y::vec_y(const vec_xyz& t) : v(t.y) {} +inline vec_y::vec_y(const vec_xyzw& t) : v(t.y) {} +inline vec_z::vec_z(const vec_xyz& t) : v(t.z) {} +inline vec_z::vec_z(const vec_xyzw& t) : v(t.z) {} +inline vec_w::vec_w(const vec_xyz& /*t*/) : v(0.0f) {} +inline vec_w::vec_w(const vec_xyzw& t) : v(t.w) {} + +inline vec_xyz::vec_xyz(const vec_xyzw& v) : x(v.x), y(v.y), z(v.z) {} + +/* ======================================================================== + * mat_33 (3×3 column-major matrix) + * + * Storage: col0, col1, col2 (each vec_xyz = one column) + * Matrix M acts on column vector v as: M*v = col0*v.x + col1*v.y + col2*v.z + * ======================================================================== */ + +struct mat_33 { + vec_xyz col0, col1, col2; + + // Default constructor – initialises to identity + mat_33() { set_identity(); } + + // Construct from 3 column vectors + mat_33(const vec_xyz& c0, const vec_xyz& c1, const vec_xyz& c2) + : col0(c0), col1(c1), col2(c2) {} + + // Construct from unit quaternion (x,y,z,w) + explicit mat_33(const vec_xyzw& q) { + float qx = q.x, qy = q.y, qz = q.z, qw = q.w; + col0 = { 1.0f-2.0f*(qy*qy+qz*qz), 2.0f*(qx*qy+qz*qw), 2.0f*(qx*qz-qy*qw) }; + col1 = { 2.0f*(qx*qy-qz*qw), 1.0f-2.0f*(qx*qx+qz*qz), 2.0f*(qy*qz+qx*qw) }; + col2 = { 2.0f*(qx*qz+qy*qw), 2.0f*(qy*qz-qx*qw), 1.0f-2.0f*(qx*qx+qy*qy) }; + } + + void set_zero() { col0.set_zero(); col1.set_zero(); col2.set_zero(); } + void set_identity() { col0={1,0,0}; col1={0,1,0}; col2={0,0,1}; } + void set_scale(const vec_xyz& s) { + col0={s.x,0,0}; col1={0,s.y,0}; col2={0,0,s.z}; + } + + // Row access (each row is the i-th elements of all columns) + void set_row0(const vec_xyz& r) { col0.x=r.x; col1.x=r.y; col2.x=r.z; } + void set_row1(const vec_xyz& r) { col0.y=r.x; col1.y=r.y; col2.y=r.z; } + void set_row2(const vec_xyz& r) { col0.z=r.x; col1.z=r.y; col2.z=r.z; } + + vec_xyz get_row0() const { return {col0.x, col1.x, col2.x}; } + vec_xyz get_row1() const { return {col0.y, col1.y, col2.y}; } + vec_xyz get_row2() const { return {col0.z, col1.z, col2.z}; } + + vec_xyz get_col0() const { return col0; } + vec_xyz get_col1() const { return col1; } + vec_xyz get_col2() const { return col2; } + + // Matrix-vector product: M * v + vec_xyz operator*(const vec_xyz& v) const { + return col0*v.x + col1*v.y + col2*v.z; + } + + // Transpose-multiply: M^T * v = (dot(col0,v), dot(col1,v), dot(col2,v)) + vec_xyz trans_mult(const vec_xyz& v) const { + return {col0.dot(v), col1.dot(v), col2.dot(v)}; + } + + mat_33 transpose() const { + mat_33 r; + r.col0 = get_row0(); r.col1 = get_row1(); r.col2 = get_row2(); + return r; + } + + mat_33 operator-() const { + mat_33 r; r.col0=-col0; r.col1=-col1; r.col2=-col2; return r; + } + + mat_33 operator+(const mat_33& m) const { + mat_33 r; r.col0=col0+m.col0; r.col1=col1+m.col1; r.col2=col2+m.col2; return r; + } + mat_33 operator-(const mat_33& m) const { + mat_33 r; r.col0=col0-m.col0; r.col1=col1-m.col1; r.col2=col2-m.col2; return r; + } + + // Matrix product: (A*B).col_i = A * B.col_i + mat_33 operator*(const mat_33& m) const { + mat_33 r; + r.col0 = *this * m.col0; + r.col1 = *this * m.col1; + r.col2 = *this * m.col2; + return r; + } + + // Transpose-multiply: M^T * A + mat_33 trans_mult(const mat_33& m) const { + // (M^T * A).col_i = M^T * A.col_i = (dot(col0,A.col_i), dot(col1,A.col_i), dot(col2,A.col_i)) + mat_33 r; + r.col0 = trans_mult(m.col0); + r.col1 = trans_mult(m.col1); + r.col2 = trans_mult(m.col2); + return r; + } + + void set_rotate_x(float a) { + float c=cosf(a), s=sinf(a); + col0={1,0,0}; col1={0,c,s}; col2={0,-s,c}; + } + void set_rotate_y(float a) { + float c=cosf(a), s=sinf(a); + col0={c,0,-s}; col1={0,1,0}; col2={s,0,c}; + } + void set_rotate_z(float a) { + float c=cosf(a), s=sinf(a); + col0={c,s,0}; col1={-s,c,0}; col2={0,0,1}; + } + + // mult_tilde: M * [v]× where [v]× is the skew-symmetric matrix + // [v]×: col0=(0,vz,-vy), col1=(-vz,0,vx), col2=(vy,-vx,0) + mat_33 mult_tilde(const vec_xyz& v) const { + mat_33 r; + r.col0 = *this * vec_xyz( 0.0f, v.z, -v.y); + r.col1 = *this * vec_xyz(-v.z, 0.0f, v.x); + r.col2 = *this * vec_xyz( v.y, -v.x, 0.0f); + return r; + } + + // 3×3 inverse via cofactors (Cramer's rule). + // Column-major names: a=col0.x=M[0][0], d=col0.y=M[1][0], g=col0.z=M[2][0] + // b=col1.x=M[0][1], e=col1.y=M[1][1], h=col1.z=M[2][1] + // c=col2.x=M[0][2], f=col2.y=M[1][2], i=col2.z=M[2][2] + // inv.col_k = (C[0][k], C[1][k], C[2][k]) / det (adjugate row k) + mat_33 inverse() const { + float a=col0.x, d=col0.y, g=col0.z; + float b=col1.x, e=col1.y, h=col1.z; + float c=col2.x, f=col2.y, ii=col2.z; + float det = a*(e*ii - f*h) - b*(d*ii - f*g) + c*(d*h - e*g); + float inv_det = 1.0f / det; + mat_33 r; + r.col0.x = ( e*ii - f*h) * inv_det; + r.col0.y = -(d*ii - f*g) * inv_det; + r.col0.z = ( d*h - e*g) * inv_det; + r.col1.x = -(b*ii - c*h) * inv_det; + r.col1.y = ( a*ii - c*g) * inv_det; + r.col1.z = -(a*h - b*g) * inv_det; + r.col2.x = ( b*f - c*e) * inv_det; + r.col2.y = -(a*f - c*d) * inv_det; + r.col2.z = ( a*e - b*d) * inv_det; + return r; + } +}; + +// Skew-symmetric matrix operator: ~v creates [v]× +inline mat_33 operator~(const vec_xyz& v) { + mat_33 r; + r.col0 = { 0.0f, v.z, -v.y}; + r.col1 = {-v.z, 0.0f, v.x}; + r.col2 = { v.y, -v.x, 0.0f}; + return r; +} + +// Row-vector * matrix: v^T * M = M^T * v +inline vec_xyz operator*(const vec_xyz& v, const mat_33& m) { + return {v.dot(m.col0), v.dot(m.col1), v.dot(m.col2)}; +} + +/* ======================================================================== + * mat_43 (4 rows × 3 columns, column-major) + * + * Storage: col0, col1, col2 (each vec_xyzw = column with 4 floats) + * Product: mat_43 * vec_3 = vec_4 (result = col0*v.x + col1*v.y + col2*v.z) + * ======================================================================== */ + +struct mat_43 { + vec_xyzw col0, col1, col2; + + mat_43() { set_zero(); } + mat_43(const vec_xyzw& c0, const vec_xyzw& c1, const vec_xyzw& c2) + : col0(c0), col1(c1), col2(c2) {} + + void set_zero() { col0.set_zero(); col1.set_zero(); col2.set_zero(); } + + vec_xyz get_row0() const { return {col0.x, col1.x, col2.x}; } + vec_xyz get_row1() const { return {col0.y, col1.y, col2.y}; } + vec_xyz get_row2() const { return {col0.z, col1.z, col2.z}; } + vec_xyz get_row3() const { return {col0.w, col1.w, col2.w}; } + + vec_xyzw get_col0() const { return col0; } + vec_xyzw get_col1() const { return col1; } + vec_xyzw get_col2() const { return col2; } + + // mat_43 * vec_3 = vec_4 + vec_xyzw operator*(const vec_xyz& v) const { + vec_xyzw r; + r.x = col0.x*v.x + col1.x*v.y + col2.x*v.z; + r.y = col0.y*v.x + col1.y*v.y + col2.y*v.z; + r.z = col0.z*v.x + col1.z*v.y + col2.z*v.z; + r.w = col0.w*v.x + col1.w*v.y + col2.w*v.z; + return r; + } + + // Transpose-multiply: (mat_43)^T * vec_4 = vec_3 + // (mat_43^T).row_i = col_i, so result_i = dot(col_i, v) + vec_xyz trans_mult(const vec_xyzw& v) const { + return {col0.dot(v), col1.dot(v), col2.dot(v)}; + } + vec_xyz trans_mult(const vector_t& v) const { + vec_xyzw v4(v.x, v.y, v.z, 0.0f); + return trans_mult(v4); + } + vec_xyz trans_mult(const point_t& v) const { + vec_xyzw v4(v.x, v.y, v.z, 1.0f); + return trans_mult(v4); + } + + mat_34 transpose() const; // defined after mat_34 is complete + + mat_43 operator-() const { + return {-col0, -col1, -col2}; + } + mat_43 operator+(const mat_43& m) const { + return {col0+m.col0, col1+m.col1, col2+m.col2}; + } + mat_43 operator-(const mat_43& m) const { + return {col0-m.col0, col1-m.col1, col2-m.col2}; + } + + // mat_43 * mat_33 = mat_43 + // result.col_i = (mat_43) * (mat_33).col_i + mat_43 operator*(const mat_33& m) const { + mat_43 r; + r.col0 = *this * m.col0; + r.col1 = *this * m.col1; + r.col2 = *this * m.col2; + return r; + } + + // mult_tilde: (mat_43) * [v]× where [v]× cols are vec_3 + // [v]×: col0=(0,vz,-vy), col1=(-vz,0,vx), col2=(vy,-vx,0) + mat_43 mult_tilde(const vec_xyz& v) const { + mat_43 r; + r.col0 = *this * vec_xyz( 0.0f, v.z, -v.y); + r.col1 = *this * vec_xyz(-v.z, 0.0f, v.x); + r.col2 = *this * vec_xyz( v.y, -v.x, 0.0f); + return r; + } +}; + +/* ======================================================================== + * mat_34 (3 rows × 4 columns, column-major) + * + * Storage: col0, col1, col2, col3 (each vec_xyz = column with 3 floats) + * Product: mat_34 * vec_4 = vec_3 (result = col0*v.x + col1*v.y + col2*v.z + col3*v.w) + * ======================================================================== */ + +struct mat_34 { + vec_xyz col0, col1, col2, col3; + + mat_34() { set_zero(); } + mat_34(const vec_xyz& c0, const vec_xyz& c1, const vec_xyz& c2, const vec_xyz& c3) + : col0(c0), col1(c1), col2(c2), col3(c3) {} + + void set_zero() { col0.set_zero(); col1.set_zero(); col2.set_zero(); col3.set_zero(); } + + vec_xyzw get_row0() const { return {col0.x, col1.x, col2.x, col3.x}; } + vec_xyzw get_row1() const { return {col0.y, col1.y, col2.y, col3.y}; } + vec_xyzw get_row2() const { return {col0.z, col1.z, col2.z, col3.z}; } + + // mat_34 * vec_4 = vec_3 + vec_xyz operator*(const vec_xyzw& v) const { + return col0*v.x + col1*v.y + col2*v.z + col3*v.w; + } + vec_xyz operator*(const vector_t& v) const { + return col0*v.x + col1*v.y + col2*v.z; + } + vec_xyz operator*(const point_t& v) const { + return col0*v.x + col1*v.y + col2*v.z + col3; + } + + // Transpose-multiply: (mat_34)^T * vec_3 = vec_4 + // (mat_34^T).row_i = col_i, so result_i = dot(col_i, v) + vec_xyzw trans_mult(const vec_xyz& v) const { + return {col0.dot(v), col1.dot(v), col2.dot(v), col3.dot(v)}; + } + + mat_43 transpose() const; // defined after mat_43 is complete + + mat_34 operator-() const { + return {-col0, -col1, -col2, -col3}; + } + mat_34 operator+(const mat_34& m) const { + return {col0+m.col0, col1+m.col1, col2+m.col2, col3+m.col3}; + } + mat_34 operator-(const mat_34& m) const { + return {col0-m.col0, col1-m.col1, col2-m.col2, col3-m.col3}; + } + + // mat_34 * mat_43 = mat_33 + // result.col_i = (mat_34) * (mat_43).col_i [mat_34 * vec_4] + mat_33 operator*(const mat_43& m) const { + mat_33 r; + r.col0 = *this * m.col0; + r.col1 = *this * m.col1; + r.col2 = *this * m.col2; + return r; + } + + // mat_34 * mat_44 = mat_34 + mat_34 operator*(const mat_44& m) const; // defined after mat_44 +}; + +/* ======================================================================== + * mat_43::transpose() returns mat_34 – defined here after mat_34 is declared + * ======================================================================== */ + +inline mat_34 mat_43_to_mat_34(const mat_43& m) { + mat_34 r; + // Row i of mat_43 = (col0[i], col1[i], col2[i]) + // Col j of mat_34 (= row j of mat_43): + r.col0 = { m.col0.x, m.col1.x, m.col2.x }; // row0 as vec_3 + r.col1 = { m.col0.y, m.col1.y, m.col2.y }; // row1 as vec_3 + r.col2 = { m.col0.z, m.col1.z, m.col2.z }; // row2 as vec_3 + r.col3 = { m.col0.w, m.col1.w, m.col2.w }; // row3 as vec_3 + return r; +} + +inline mat_43 mat_34_to_mat_43(const mat_34& m) { + mat_43 r; + // Transpose of mat_34 (3x4) = mat_43 (4x3) + // col0 of result = row0 of mat_34 = (col0.x, col1.x, col2.x, col3.x) + r.col0 = { m.col0.x, m.col1.x, m.col2.x, m.col3.x }; + r.col1 = { m.col0.y, m.col1.y, m.col2.y, m.col3.y }; + r.col2 = { m.col0.z, m.col1.z, m.col2.z, m.col3.z }; + return r; +} + +// Row-vector * mat_43: vec_4^T * mat_43 = vec_3 +inline vec_xyz operator*(const vec_xyzw& v, const mat_43& m) { + return {v.dot(m.col0), v.dot(m.col1), v.dot(m.col2)}; +} + +// Row-vector * mat_34: vec_3^T * mat_34 = vec_4 +inline vec_xyzw operator*(const vec_xyz& v, const mat_34& m) { + return {v.dot(m.col0), v.dot(m.col1), v.dot(m.col2), v.dot(m.col3)}; +} + +/* ======================================================================== + * mat_44 (4×4 column-major matrix) + * ======================================================================== */ + +struct mat_44 { + vec_xyzw col0, col1, col2, col3; + + mat_44() { set_identity(); } + + // Construct from 4 column vectors + mat_44(const vec_xyzw& c0, const vec_xyzw& c1, const vec_xyzw& c2, const vec_xyzw& c3) + : col0(c0), col1(c1), col2(c2), col3(c3) {} + + void set_zero() { col0.set_zero(); col1.set_zero(); col2.set_zero(); col3.set_zero(); } + void set_identity() { + col0={1,0,0,0}; col1={0,1,0,0}; col2={0,0,1,0}; col3={0,0,0,1}; + } + void set_scale(const vec_xyz& s) { + col0={s.x,0,0,0}; col1={0,s.y,0,0}; col2={0,0,s.z,0}; col3={0,0,0,1}; + } + void set_translate(const vec_xyz& t) { + col0={1,0,0,0}; col1={0,1,0,0}; col2={0,0,1,0}; col3={t.x,t.y,t.z,1}; + } + + void set_row0(const vec_xyzw& r) { col0.x=r.x; col1.x=r.y; col2.x=r.z; col3.x=r.w; } + void set_row1(const vec_xyzw& r) { col0.y=r.x; col1.y=r.y; col2.y=r.z; col3.y=r.w; } + void set_row2(const vec_xyzw& r) { col0.z=r.x; col1.z=r.y; col2.z=r.z; col3.z=r.w; } + void set_row3(const vec_xyzw& r) { col0.w=r.x; col1.w=r.y; col2.w=r.z; col3.w=r.w; } + + vec_xyzw get_row0() const { return {col0.x, col1.x, col2.x, col3.x}; } + vec_xyzw get_row1() const { return {col0.y, col1.y, col2.y, col3.y}; } + vec_xyzw get_row2() const { return {col0.z, col1.z, col2.z, col3.z}; } + vec_xyzw get_row3() const { return {col0.w, col1.w, col2.w, col3.w}; } + + vec_xyzw get_col0() const { return col0; } + vec_xyzw get_col1() const { return col1; } + vec_xyzw get_col2() const { return col2; } + vec_xyzw get_col3() const { return col3; } + + // M * v + vec_xyzw operator*(const vec_xyzw& v) const { + vec_xyzw r; + r.x = col0.x*v.x + col1.x*v.y + col2.x*v.z + col3.x*v.w; + r.y = col0.y*v.x + col1.y*v.y + col2.y*v.z + col3.y*v.w; + r.z = col0.z*v.x + col1.z*v.y + col2.z*v.z + col3.z*v.w; + r.w = col0.w*v.x + col1.w*v.y + col2.w*v.z + col3.w*v.w; + return r; + } + + // Transpose-multiply: M^T * v + vec_xyzw trans_mult(const vec_xyzw& v) const { + return {col0.dot(v), col1.dot(v), col2.dot(v), col3.dot(v)}; + } + vec_xyzw trans_mult(const vector_t& v) const { + vec_xyzw v4(v.x, v.y, v.z, 0.0f); + return trans_mult(v4); + } + vec_xyzw trans_mult(const point_t& v) const { + vec_xyzw v4(v.x, v.y, v.z, 1.0f); + return trans_mult(v4); + } + + mat_44 transpose() const { + mat_44 r; + r.col0={col0.x,col1.x,col2.x,col3.x}; + r.col1={col0.y,col1.y,col2.y,col3.y}; + r.col2={col0.z,col1.z,col2.z,col3.z}; + r.col3={col0.w,col1.w,col2.w,col3.w}; + return r; + } + + mat_44 operator-() const { + mat_44 r; r.col0=-col0; r.col1=-col1; r.col2=-col2; r.col3=-col3; return r; + } + mat_44 operator+(const mat_44& m) const { + mat_44 r; r.col0=col0+m.col0; r.col1=col1+m.col1; + r.col2=col2+m.col2; r.col3=col3+m.col3; return r; + } + mat_44 operator-(const mat_44& m) const { + mat_44 r; r.col0=col0-m.col0; r.col1=col1-m.col1; + r.col2=col2-m.col2; r.col3=col3-m.col3; return r; + } + + // mat_44 * mat_44 + mat_44 operator*(const mat_44& m) const { + mat_44 r; + r.col0 = *this * m.col0; + r.col1 = *this * m.col1; + r.col2 = *this * m.col2; + r.col3 = *this * m.col3; + return r; + } + + // Transpose-multiply: M^T * A + mat_44 trans_mult(const mat_44& m) const { + mat_44 r; + r.col0 = trans_mult(m.col0); + r.col1 = trans_mult(m.col1); + r.col2 = trans_mult(m.col2); + r.col3 = trans_mult(m.col3); + return r; + } + + void set_rotate_x(float a) { + float c=cosf(a), s=sinf(a); + col0={1,0,0,0}; col1={0,c,s,0}; col2={0,-s,c,0}; col3={0,0,0,1}; + } + void set_rotate_y(float a) { + float c=cosf(a), s=sinf(a); + col0={c,0,-s,0}; col1={0,1,0,0}; col2={s,0,c,0}; col3={0,0,0,1}; + } + void set_rotate_z(float a) { + float c=cosf(a), s=sinf(a); + col0={c,s,0,0}; col1={-s,c,0,0}; col2={0,0,1,0}; col3={0,0,0,1}; + } + + // mat_44 * mat_43 = mat_43 + // result.col_i = (mat_44) * (mat_43).col_i [mat_44 * vec_4] + mat_43 operator*(const mat_43& m) const { + mat_43 r; + r.col0 = *this * m.col0; + r.col1 = *this * m.col1; + r.col2 = *this * m.col2; + return r; + } +}; + +// Row-vector * mat_44: vec_4^T * mat_44 = vec_4 +inline vec_xyzw operator*(const vec_xyzw& v, const mat_44& m) { + return {v.dot(m.col0), v.dot(m.col1), v.dot(m.col2), v.dot(m.col3)}; +} + +// mat_34 * mat_44 = mat_34 (defined here now that mat_44 is complete) +inline mat_34 mat_34_mult_mat_44(const mat_34& a, const mat_44& b) { + // result.col_i = a * b.col_i [mat_34 * vec_4] + mat_34 r; + r.col0 = a * b.col0; + r.col1 = a * b.col1; + r.col2 = a * b.col2; + r.col3 = a * b.col3; + return r; +} + +/* ======================================================================== + * mat_43::transpose / mat_34::transpose – deferred method bodies + * ======================================================================== */ + +// These must live outside the struct bodies since both mat_43 and mat_34 +// need to be complete before defining each other's transpose. + +/* ======================================================================== + * transform_t (rigid body transform: 3×3 rotation + 3D translation) + * + * Stored as 4×4 column-major matrix: + * col0, col1, col2 = rotation columns (w=0) + * col3 = translation (w=1) + * ======================================================================== */ + +struct transform_t { + vec_xyzw col0, col1, col2, col3; + + transform_t() { set_identity(); } + + void set_identity() { + col0={1,0,0,0}; col1={0,1,0,0}; col2={0,0,1,0}; col3={0,0,0,1}; + } + void set_zero() { + col0.set_zero(); col1.set_zero(); col2.set_zero(); col3.set_zero(); + } + + // Row access (includes translation column) + vec_xyzw get_row0() const { return {col0.x, col1.x, col2.x, col3.x}; } + vec_xyzw get_row1() const { return {col0.y, col1.y, col2.y, col3.y}; } + vec_xyzw get_row2() const { return {col0.z, col1.z, col2.z, col3.z}; } + vec_xyzw get_row3() const { return {col0.w, col1.w, col2.w, col3.w}; } + + // Apply to 4D vector: result = R*v.xyz + t*v.w + vec_xyzw operator*(const vec_xyzw& v) const { + vec_xyzw r; + r.x = col0.x*v.x + col1.x*v.y + col2.x*v.z + col3.x*v.w; + r.y = col0.y*v.x + col1.y*v.y + col2.y*v.z + col3.y*v.w; + r.z = col0.z*v.x + col1.z*v.y + col2.z*v.z + col3.z*v.w; + r.w = col0.w*v.x + col1.w*v.y + col2.w*v.z + col3.w*v.w; + return r; + } + + // Apply to direction (w=0): rotation only + vector_t operator*(const vector_t& v) const { + return { col0.x*v.x + col1.x*v.y + col2.x*v.z, + col0.y*v.x + col1.y*v.y + col2.y*v.z, + col0.z*v.x + col1.z*v.y + col2.z*v.z }; + } + + // Apply to point (w=1): rotation + translation + point_t operator*(const point_t& v) const { + return { col0.x*v.x + col1.x*v.y + col2.x*v.z + col3.x, + col0.y*v.x + col1.y*v.y + col2.y*v.z + col3.y, + col0.z*v.x + col1.z*v.y + col2.z*v.z + col3.z }; + } + + // Compose transforms + transform_t operator*(const transform_t& b) const { + transform_t r; + r.col0 = {col0.x*b.col0.x + col1.x*b.col0.y + col2.x*b.col0.z, + col0.y*b.col0.x + col1.y*b.col0.y + col2.y*b.col0.z, + col0.z*b.col0.x + col1.z*b.col0.y + col2.z*b.col0.z, + 0.0f}; + r.col1 = {col0.x*b.col1.x + col1.x*b.col1.y + col2.x*b.col1.z, + col0.y*b.col1.x + col1.y*b.col1.y + col2.y*b.col1.z, + col0.z*b.col1.x + col1.z*b.col1.y + col2.z*b.col1.z, + 0.0f}; + r.col2 = {col0.x*b.col2.x + col1.x*b.col2.y + col2.x*b.col2.z, + col0.y*b.col2.x + col1.y*b.col2.y + col2.y*b.col2.z, + col0.z*b.col2.x + col1.z*b.col2.y + col2.z*b.col2.z, + 0.0f}; + r.col3 = {col0.x*b.col3.x + col1.x*b.col3.y + col2.x*b.col3.z + col3.x, + col0.y*b.col3.x + col1.y*b.col3.y + col2.y*b.col3.z + col3.y, + col0.z*b.col3.x + col1.z*b.col3.y + col2.z*b.col3.z + col3.z, + 1.0f}; + return r; + } + + // Orthonormal inverse: inv = { R^T, -R^T*t, 0,0,0,1 } + // inv.col_i (rotation part) = (col0[i], col1[i], col2[i], 0) [transpose of rotation cols] + // inv.col3 = (-dot(col0.xyz, col3.xyz), -dot(col1.xyz, col3.xyz), -dot(col2.xyz, col3.xyz), 1) + transform_t orthonormal_inverse() const { + transform_t r; + // Transposed rotation + r.col0 = {col0.x, col1.x, col2.x, 0.0f}; + r.col1 = {col0.y, col1.y, col2.y, 0.0f}; + r.col2 = {col0.z, col1.z, col2.z, 0.0f}; + // Translation: -R^T * t + float tx = col3.x, ty = col3.y, tz = col3.z; + r.col3 = {-(col0.x*tx + col0.y*ty + col0.z*tz), + -(col1.x*tx + col1.y*ty + col1.z*tz), + -(col2.x*tx + col2.y*ty + col2.z*tz), + 1.0f}; + return r; + } + + void orthonormal_inverse_in_place() { + *this = orthonormal_inverse(); + } + + // General inverse (uses orthonormal_inverse since transform_t is always rigid) + transform_t inverse() const { + return orthonormal_inverse(); + } + + void set_rotation(const mat_33& rot) { + col0 = {rot.col0.x, rot.col0.y, rot.col0.z, 0.0f}; + col1 = {rot.col1.x, rot.col1.y, rot.col1.z, 0.0f}; + col2 = {rot.col2.x, rot.col2.y, rot.col2.z, 0.0f}; + } + void set_translation(const vec_xyz& t) { + col3 = {t.x, t.y, t.z, 1.0f}; + } + void set_rotate_x(float a) { + float c=cosf(a), s=sinf(a); + col0={1,0,0,0}; col1={0,c,s,0}; col2={0,-s,c,0}; col3={0,0,0,1}; + } + void set_rotate_y(float a) { + float c=cosf(a), s=sinf(a); + col0={c,0,-s,0}; col1={0,1,0,0}; col2={s,0,c,0}; col3={0,0,0,1}; + } + void set_rotate_z(float a) { + float c=cosf(a), s=sinf(a); + col0={c,s,0,0}; col1={-s,c,0,0}; col2={0,0,1,0}; col3={0,0,0,1}; + } +}; + +/* ======================================================================== + * Deferred member function bodies (all types now complete) + * ======================================================================== */ + +// mat_43::transpose() -> mat_34 +inline mat_34 mat_43::transpose() const { return mat_43_to_mat_34(*this); } + +// mat_34::transpose() -> mat_43 +inline mat_43 mat_34::transpose() const { return mat_34_to_mat_43(*this); } + +// mat_34 * mat_44 -> mat_34 +inline mat_34 mat_34::operator*(const mat_44& m) const { + return mat_34_mult_mat_44(*this, m); +} + diff --git a/include/ps2s/cpu_matrix.h b/include/ps2s/cpu_matrix.h index 3655312..a86b422 100644 --- a/include/ps2s/cpu_matrix.h +++ b/include/ps2s/cpu_matrix.h @@ -80,6 +80,7 @@ class cpu_mat_44 { inline cpu_vec_4 cpu_mat_44::operator*(const cpu_vec_4& rhs) const { +#if defined(NO_ASM) || defined(NO_VU0_VECTORS) cpu_vec_4 result; cpu_vec_4 row0(col0(0), col1(0), col2(0), col3(0)); @@ -95,11 +96,33 @@ inline cpu_vec_4 result[3] = row3.dot(rhs); return result; +#else + // VU0 macro-mode: result = col0*rhs.x + col1*rhs.y + col2*rhs.z + col3*rhs.w + // Requires cpu_vec_4 / cpu_mat_44 to be 16-byte aligned (enforced via alignas). + cpu_vec_4 result; + asm volatile( + " ### cpu_mat_44 * cpu_vec_4 ### \n" + "lqc2 $vf1, 0(%[M]) \n" + "lqc2 $vf2, 16(%[M]) \n" + "lqc2 $vf3, 32(%[M]) \n" + "lqc2 $vf4, 48(%[M]) \n" + "lqc2 $vf5, 0(%[V]) \n" + "vmulax $ACC, $vf1, $vf5 \n" + "vmadday $ACC, $vf2, $vf5 \n" + "vmaddaz $ACC, $vf3, $vf5 \n" + "vmaddw $vf6, $vf4, $vf5 \n" + "sqc2 $vf6, 0(%[R]) \n" + : + : [M] "r"(this), [V] "r"(&rhs), [R] "r"(&result) + : "memory"); + return result; +#endif } inline cpu_mat_44 cpu_mat_44::operator*(const cpu_mat_44& rhs) const { +#if defined(NO_ASM) || defined(NO_VU0_VECTORS) cpu_mat_44 result; result.col0 = *this * rhs.get_col0(); @@ -108,6 +131,48 @@ inline cpu_mat_44 result.col3 = *this * rhs.get_col3(); return result; +#else + // VU0 macro-mode: load this once into vf1..vf4, apply to each col of rhs. + cpu_mat_44 result; + asm volatile( + " ### cpu_mat_44 * cpu_mat_44 ### \n" + "lqc2 $vf1, 0(%[A]) \n" + "lqc2 $vf2, 16(%[A]) \n" + "lqc2 $vf3, 32(%[A]) \n" + "lqc2 $vf4, 48(%[A]) \n" + + "lqc2 $vf5, 0(%[B]) \n" + "vmulax $ACC, $vf1, $vf5 \n" + "vmadday $ACC, $vf2, $vf5 \n" + "vmaddaz $ACC, $vf3, $vf5 \n" + "vmaddw $vf6, $vf4, $vf5 \n" + "sqc2 $vf6, 0(%[R]) \n" + + "lqc2 $vf5, 16(%[B]) \n" + "vmulax $ACC, $vf1, $vf5 \n" + "vmadday $ACC, $vf2, $vf5 \n" + "vmaddaz $ACC, $vf3, $vf5 \n" + "vmaddw $vf6, $vf4, $vf5 \n" + "sqc2 $vf6, 16(%[R]) \n" + + "lqc2 $vf5, 32(%[B]) \n" + "vmulax $ACC, $vf1, $vf5 \n" + "vmadday $ACC, $vf2, $vf5 \n" + "vmaddaz $ACC, $vf3, $vf5 \n" + "vmaddw $vf6, $vf4, $vf5 \n" + "sqc2 $vf6, 32(%[R]) \n" + + "lqc2 $vf5, 48(%[B]) \n" + "vmulax $ACC, $vf1, $vf5 \n" + "vmadday $ACC, $vf2, $vf5 \n" + "vmaddaz $ACC, $vf3, $vf5 \n" + "vmaddw $vf6, $vf4, $vf5 \n" + "sqc2 $vf6, 48(%[R]) \n" + : + : [A] "r"(this), [B] "r"(&rhs), [R] "r"(&result) + : "memory"); + return result; +#endif } #endif // ps2s_cpu_matrix_h diff --git a/include/ps2s/cpu_vector.h b/include/ps2s/cpu_vector.h index 4da0508..f0de4ed 100644 --- a/include/ps2s/cpu_vector.h +++ b/include/ps2s/cpu_vector.h @@ -70,8 +70,8 @@ class cpu_vec_3 { "mtc1 _temp0, _z # z = value.z \n" ".endif \n" - : "=&r,f _x"(x), "=&r,f _y"(y), "=&r,f _z"(z), "=&r,&r _temp0"(temp0) - : "r,r _vec"(vec)); + : "=&r,&f"(x), "=&r,&f"(y), "=&r,&f"(z), "=&r"(temp0) + : "r"(vec)); } explicit inline cpu_vec_3(const vec_3 vec) { set(vec.vec128); } @@ -156,7 +156,7 @@ typedef cpu_vec_3 cpu_vec_xyz; * cpu_vec_4 (or cpu_vec_xyzw) */ -class cpu_vec_4 { +class alignas(16) cpu_vec_4 { public: float x, y, z, w; @@ -215,8 +215,8 @@ class cpu_vec_4 { "mtc1 _temp0, _w # w = value.w \n" ".endif \n" - : "=&r,f _x"(x), "=&r,f _y"(y), "=&r,f _z"(z), "=&r,f _w"(w), "=&r,&r _temp0"(temp0) - : "r,r _vec"(vec)); + : "=&r,&f"(x), "=&r,&f"(y), "=&r,&f"(z), "=&r,&f"(w), "=&r"(temp0) + : "r"(vec)); } explicit inline cpu_vec_4(const vec_4 vec) { set(vec.vec128); } @@ -335,12 +335,12 @@ cpu_vec_3::operator+(const cpu_vec_3& vec) #else asm(" ### cpu_vec_3 + cpu_vec_3 ### \n" - "add.s rx, v0x, v1x \n" - "add.s ry, v0y, v1y \n" - "add.s rz, v0z, v1z \n" - : "=&f rx"(result.x), "=&f ry"(result.y), "=&f rz"(result.z) - : "f v0x"(x), "f v0y"(y), "f v0z"(z), - "f v1x"(vec.x), "f v1y"(vec.y), "f v1z"(vec.z)); + "add.s %[rx], %[v0x], %[v1x] \n" + "add.s %[ry], %[v0y], %[v1y] \n" + "add.s %[rz], %[v0z], %[v1z] \n" + : [rx] "=&f"(result.x), [ry] "=&f"(result.y), [rz] "=&f"(result.z) + : [v0x] "f"(x), [v0y] "f"(y), [v0z] "f"(z), + [v1x] "f"(vec.x), [v1y] "f"(vec.y), [v1z] "f"(vec.z)); #endif return result; @@ -359,12 +359,12 @@ cpu_vec_3::operator-(const cpu_vec_3& vec) #else asm(" ### cpu_vec_3 - cpu_vec_3 ### \n" - "sub.s rx, v0x, v1x \n" - "sub.s ry, v0y, v1y \n" - "sub.s rz, v0z, v1z \n" - : "=&f rx"(result.x), "=&f ry"(result.y), "=&f rz"(result.z) - : "f v0x"(x), "f v0y"(y), "f v0z"(z), - "f v1x"(vec.x), "f v1y"(vec.y), "f v1z"(vec.z)); + "sub.s %[rx], %[v0x], %[v1x] \n" + "sub.s %[ry], %[v0y], %[v1y] \n" + "sub.s %[rz], %[v0z], %[v1z] \n" + : [rx] "=&f"(result.x), [ry] "=&f"(result.y), [rz] "=&f"(result.z) + : [v0x] "f"(x), [v0y] "f"(y), [v0z] "f"(z), + [v1x] "f"(vec.x), [v1y] "f"(vec.y), [v1z] "f"(vec.z)); #endif return result; @@ -488,12 +488,21 @@ inline cpu_vec_4 cpu_vec_4::operator+(const cpu_vec_4& vec) { cpu_vec_4 result; - +#if defined(NO_ASM) || defined(NO_VU0_VECTORS) result.x = x + vec.x; result.y = y + vec.y; result.z = z + vec.z; result.w = w + vec.w; - +#else + asm volatile(" ### cpu_vec_4 + cpu_vec_4 (VU0) ### \n" + "lqc2 $vf1, 0(%[A]) \n" + "lqc2 $vf2, 0(%[B]) \n" + "vadd.xyzw $vf3, $vf1, $vf2 \n" + "sqc2 $vf3, 0(%[R]) \n" + : + : [A] "r"(this), [B] "r"(&vec), [R] "r"(&result) + : "memory"); +#endif return result; } @@ -501,12 +510,21 @@ inline cpu_vec_4 cpu_vec_4::operator-(const cpu_vec_4& vec) { cpu_vec_4 result; - +#if defined(NO_ASM) || defined(NO_VU0_VECTORS) result.x = x - vec.x; result.y = y - vec.y; result.z = z - vec.z; result.w = w - vec.w; - +#else + asm volatile(" ### cpu_vec_4 - cpu_vec_4 (VU0) ### \n" + "lqc2 $vf1, 0(%[A]) \n" + "lqc2 $vf2, 0(%[B]) \n" + "vsub.xyzw $vf3, $vf1, $vf2 \n" + "sqc2 $vf3, 0(%[R]) \n" + : + : [A] "r"(this), [B] "r"(&vec), [R] "r"(&result) + : "memory"); +#endif return result; } @@ -514,12 +532,21 @@ inline cpu_vec_4 cpu_vec_4::operator-() { cpu_vec_4 result; - +#if defined(NO_ASM) || defined(NO_VU0_VECTORS) result.x = -x; result.y = -y; result.z = -z; result.w = -w; - +#else + asm volatile(" ### -cpu_vec_4 (VU0) ### \n" + "lqc2 $vf1, 0(%[A]) \n" + "vsub.xyzw $vf2, $vf1, $vf1 \n" // vf2 = 0 (vf0.w is 1.0, can't use it) + "vsub.xyzw $vf3, $vf2, $vf1 \n" // vf3 = 0 - v + "sqc2 $vf3, 0(%[R]) \n" + : + : [A] "r"(this), [R] "r"(&result) + : "memory"); +#endif return result; } @@ -527,12 +554,21 @@ inline cpu_vec_4 cpu_vec_4::operator*(const cpu_vec_4& vec) { cpu_vec_4 result; - +#if defined(NO_ASM) || defined(NO_VU0_VECTORS) result.x = x * vec.x; result.y = y * vec.y; result.z = z * vec.z; result.w = w * vec.w; - +#else + asm volatile(" ### cpu_vec_4 * cpu_vec_4 (VU0) ### \n" + "lqc2 $vf1, 0(%[A]) \n" + "lqc2 $vf2, 0(%[B]) \n" + "vmul.xyzw $vf3, $vf1, $vf2 \n" + "sqc2 $vf3, 0(%[R]) \n" + : + : [A] "r"(this), [B] "r"(&vec), [R] "r"(&result) + : "memory"); +#endif return result; } diff --git a/include/ps2s/matrix.h b/include/ps2s/matrix.h index 2fd63bb..df6aee6 100644 --- a/include/ps2s/matrix.h +++ b/include/ps2s/matrix.h @@ -51,70 +51,82 @@ class mat_33 : public mat_x3_template { void set_identity() { - asm(" ### mat_33::set_identity ### \n" - "vsub %[col0], %[col0], %[col0] \n" - "vsub %[col1], %[col1], %[col1] \n" - "vmr32 %[col2], vf00 \n" - "vaddw.x %[col0], vf00, vf00 \n" - "vaddw.y %[col1], vf00, vf00 \n" - : [col0] "=j"(col0), - [col1] "=j"(col1), - [col2] "=j"(col2)); + asm __volatile__( + "vsub $vf1, $vf0, $vf0 \n" + "vsub $vf2, $vf0, $vf0 \n" + "vmr32 $vf3, $vf0 \n" + "vaddw.x $vf1, $vf1, $vf0 \n" + "vaddw.y $vf2, $vf2, $vf0 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128) + : + : "memory"); } void set_scale(vec_3 scale) { set_zero(); - col0 = (vec_x)scale; - col1 = (vec_y)scale; - col2 = (vec_z)scale; + // Extract components and set diagonal elements using VU0 + float sx = (float)vec_x(scale); + float sy = (float)vec_y(scale); + float sz = (float)vec_z(scale); + asm __volatile__( + "ctc2 %[sx], $vi21 \n" + "vaddi.x $vf1, $vf0, $I \n" + "ctc2 %[sy], $vi21 \n" + "vaddi.y $vf2, $vf0, $I \n" + "ctc2 %[sz], $vi21 \n" + "vaddi.z $vf3, $vf0, $I \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128) + : [sx] "r"(sx), [sy] "r"(sy), [sz] "r"(sz) + : "memory"); } void set(vec_4 quat) { - vec128_t ones, q_yzx, q_zxy; - - asm("### set rotation from unit quaternion ### \n" - - "vmr32.xy %[q_yzx], %[quat] \n" - "vaddx.z %[q_yzx], vf00, %[quat] \n" - "vaddz.x %[q_zxy], vf00, %[quat] \n" - "vaddx.y %[q_zxy], vf00, %[quat] \n" - "vaddy.z %[q_zxy], vf00, %[quat] \n" - - "vmula ACC, %[quat], %[q_yzx] \n" - "vmsubw.z %[col0], %[q_zxy], %[quat] \n" - "vmsubw.x %[col1], %[q_zxy], %[quat] \n" - "vmsubw.y %[col2], %[q_zxy], %[quat] \n" - - "vmula ACC, %[quat], %[q_zxy] \n" - "vmaddw.y %[col0], %[q_yzx], %[quat] \n" - "vmaddw.z %[col1], %[q_yzx], %[quat] \n" - "vmaddw.x %[col2], %[q_yzx], %[quat] \n" - - "vmula ACC, %[q_zxy], %[q_zxy] \n" - "vmadd.x %[col0], %[q_yzx], %[q_yzx] \n" - "vmadd.y %[col1], %[q_yzx], %[q_yzx] \n" - "vmadd.z %[col2], %[q_yzx], %[q_yzx] \n" - - "vmaxw %[ones], vf00, vf00 \n" - "vadd %[col0], %[col0], %[col0] \n" - "vadd %[col1], %[col1], %[col1] \n" - "vadd %[col2], %[col2], %[col2] \n" - - "vsub.x %[col0], %[ones], %[col0] \n" - "vsub.y %[col1], %[ones], %[col1] \n" - "vsub.z %[col2], %[ones], %[col2] \n" - - : [col0] "=&j"(col0), - [col1] "=&j"(col1), - [col2] "=&j"(col2), - [q_yzx] "=&j"(q_yzx), - [q_zxy] "=&j"(q_zxy), - [ones] "=&j"(ones), "=r"(vu0_ACC) - : [quat] "j"(quat)); + vec128_t qv = quat.vec128; + vec128_t col0_v128, col1_v128, col2_v128; + asm __volatile__ ("### set rotation from unit quaternion ### \n" + "qmtc2 %[quat], $vf1 \n" + "vmr32.xy $vf2, $vf1 \n" + "vaddx.z $vf2, $vf0, $vf1 \n" + "vaddz.x $vf3, $vf0, $vf1 \n" + "vaddx.y $vf3, $vf0, $vf1 \n" + "vaddy.z $vf3, $vf0, $vf1 \n" + "vmula $ACC, $vf1, $vf2 \n" + "vmsubw.z $vf4, $vf3, $vf1 \n" + "vmsubw.x $vf5, $vf3, $vf1 \n" + "vmsubw.y $vf6, $vf3, $vf1 \n" + "vmula $ACC, $vf1, $vf3 \n" + "vmaddw.y $vf4, $vf2, $vf1 \n" + "vmaddw.z $vf5, $vf2, $vf1 \n" + "vmaddw.x $vf6, $vf2, $vf1 \n" + "vmula $ACC, $vf3, $vf3 \n" + "vmadd.x $vf4, $vf2, $vf2 \n" + "vmadd.y $vf5, $vf2, $vf2 \n" + "vmadd.z $vf6, $vf2, $vf2 \n" + "vmaxw $vf7, $vf0, $vf0 \n" + "vadd $vf4, $vf4, $vf4 \n" + "vadd $vf5, $vf5, $vf5 \n" + "vadd $vf6, $vf6, $vf6 \n" + "vsub.x $vf4, $vf7, $vf4 \n" + "vsub.y $vf5, $vf7, $vf5 \n" + "vsub.z $vf6, $vf7, $vf6 \n" + "qmfc2 %[c0], $vf4 \n" + "qmfc2 %[c1], $vf5 \n" + "qmfc2 %[c2], $vf6 \n" + : [c0] "=r"(col0_v128), [c1] "=r"(col1_v128), [c2] "=r"(col2_v128), "=r"(vu0_ACC) + : [quat] "r"(qv)); + col0.vec128 = col0_v128; + col1.vec128 = col1_v128; + col2.vec128 = col2_v128; } mat_33(vec_4 quat) @@ -219,25 +231,26 @@ class mat_33 : public mat_x3_template { vec128_t result, temp0, temp1, temp2, ones; asm("### mat_33 trans_mult vec_3 ### \n" - "vmul %[temp0], col0], %[vec] \n" - "vmaxw %[ones], vf00, vf00 \n" - "vmul %[temp1], %[col1], %[vec] \n" - "vmul %[temp2], %[col2], %[vec] \n" - "vadday.x ACC, %[temp0], %[temp0] \n" - "vmaddz.x %[result], %[ones], %[temp0] \n" - "vaddax.y ACC, %[temp1], %[temp1] \n" - "vmaddz.y %[result], %[ones], %[temp1] \n" - "vaddax.z ACC, %[temp2], %[temp2] \n" - "vmaddy.z %[result], %[ones], %[temp2] \n" - : [result] "=&j"(result), - [temp0] "=&j"(temp0), - [temp1] "=&j"(temp1), - [temp2] "=&j"(temp2), - [ones] "=&j"(ones), "=r"(vu0_ACC) - : "j col0"(col0), - "j col1"(col1), - "j col2"(col2), - "j vec"(vec)); + "qmtc2 %[col0], $vf10 \n" + "qmtc2 %[col1], $vf11 \n" + "qmtc2 %[col2], $vf12 \n" + "qmtc2 %[vec], $vf13 \n" + "vmul $vf14, $vf10, $vf13 \n" + "vmaxw $vf15, $vf0, $vf0 \n" + "vmul $vf16, $vf11, $vf13 \n" + "vmul $vf17, $vf12, $vf13 \n" + "vadday.x $ACC, $vf14, $vf14 \n" + "vmaddz.x $vf14, $vf15, $vf14 \n" + "vaddax.y $ACC, $vf16, $vf16 \n" + "vmaddz.y $vf14, $vf15, $vf16 \n" + "vaddax.z $ACC, $vf17, $vf17 \n" + "vmaddy.z $vf14, $vf15, $vf17 \n" + "qmfc2 %[result], $vf14 \n" + : [result] "=&r"(result), "=r"(vu0_ACC) + : [col0] "r"(col0.vec128), + [col1] "r"(col1.vec128), + [col2] "r"(col2.vec128), + [vec] "r"(vec.vec128)); return vec_3(result); } @@ -383,81 +396,102 @@ class mat_43 : public mat_x3_template { vec_3 trans_mult(vec_4 vec) const { - vec128_t result, temp0, temp1, temp2, ones; - - asm("### mat_43 trans_mult vec_4 ### \n" - "vmul %[temp0], %[col0], %[vec] \n" - "vmaxw %[ones], vf00, vf00 \n" - "vmul %[temp1], %[col1], %[vec] \n" - "vmul %[temp2], %[col2], %[vec] \n" - "vadday.x ACC, %[temp0], %[temp0] \n" - "vmaddaz.x ACC, %[ones], %[temp0] \n" - "vmaddw.x %[result], %[ones], %[temp0] \n" - "vaddax.y ACC, %[temp1], %[temp1] \n" - "vmaddaz.y ACC, %[ones], %[temp1] \n" - "vmaddw.y %[result], %[ones], %[temp1] \n" - "vaddax.z ACC, %[temp2], %[temp2] \n" - "vmadday.z ACC, %[ones], %[temp2] \n" - "vmaddw.z %[result], %[ones], %[temp2] \n" - : [result] "=&j"(result), - [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [ones] "=&j"(ones), "=r"(vu0_ACC) - : [col0] "j"(col0), - [col1] "j"(col1), - [col2] "j"(col2), - [vec] "j"(vec)); - + vec128_t result; + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t vec_val = vec.vec128; + asm __volatile__( + "### mat_43 trans_mult vec_4 ### \n" + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[vec_val], $vf4 \n" + "vmaxw $vf5, $vf0, $vf0 \n" + "vmul $vf6, $vf1, $vf4 \n" + "vmul $vf7, $vf2, $vf4 \n" + "vmul $vf8, $vf3, $vf4 \n" + "vadday.x $ACC, $vf6, $vf6 \n" + "vmaddaz.x $ACC, $vf5, $vf6 \n" + "vmaddw.x $vf9, $vf5, $vf6 \n" + "vaddax.y $ACC, $vf7, $vf7 \n" + "vmaddaz.y $ACC, $vf5, $vf7 \n" + "vmaddw.y $vf9, $vf5, $vf7 \n" + "vaddax.z $ACC, $vf8, $vf8 \n" + "vmadday.z $ACC, $vf5, $vf8 \n" + "vmaddw.z $vf9, $vf5, $vf8 \n" + "qmfc2 %[result], $vf9 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), + [col2_val] "r"(col2_val), [vec_val] "r"(vec_val) + : "memory"); return vec_3(result); } vec_3 trans_mult(vector_t vec) const { - vec128_t result, temp0, temp1, temp2, ones; - - asm("### mat_43 trans_mult vector_t ### \n" - "vmul %[temp0], %[col0], %[vec] \n" - "vmaxw %[ones], vf00, vf00 \n" - "vmul %[temp1], %[col1], %[vec] \n" - "vmul %[temp2], %[col2], %[vec] \n" - "vadday.x ACC, %[temp0], %[temp0] \n" - "vmaddz.x %[result], %[ones], %[temp0] \n" - "vaddax.y ACC, %[temp1], %[temp1] \n" - "vmaddz.y %[result, %[ones], %[temp1] \n" - "vaddax.z ACC, %[temp2], %[temp2] \n" - "vmaddy.z %[result], %[ones], %[temp2] \n" - : [result] "=&j"(result), - [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [ones] "=&j"(ones), "=r"(vu0_ACC) - : [col0] "j"(col0), - [col1] "j"(col1), - [col2] "j"(col2), - [vec] "j"(vec)); - + vec128_t result; + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t vec_val = vec.vec128; + asm __volatile__( + "### mat_43 trans_mult vector_t ### \n" + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[vec_val], $vf4 \n" + "vmaxw $vf5, $vf0, $vf0 \n" + "vmul $vf6, $vf1, $vf4 \n" + "vmul $vf7, $vf2, $vf4 \n" + "vmul $vf8, $vf3, $vf4 \n" + "vadday.x $ACC, $vf6, $vf6 \n" + "vmaddz.x $vf9, $vf5, $vf6 \n" + "vaddax.y $ACC, $vf7, $vf7 \n" + "vmaddz.y $vf9, $vf5, $vf7 \n" + "vaddax.z $ACC, $vf8, $vf8 \n" + "vmaddy.z $vf9, $vf5, $vf8 \n" + "qmfc2 %[result], $vf9 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), + [col2_val] "r"(col2_val), [vec_val] "r"(vec_val) + : "memory"); return vec_3(result); } vec_3 trans_mult(point_t vec) const { - vec128_t result, temp0, temp1, temp2, ones; - - asm("### mat_43 trans_mult point_t ### \n" - "vmul %[temp0], %[col0], %[vec] \n" - "vmaxw %[ones], vf00, vf00 \n" - "vmul %[temp1], %[col1], %[vec] \n" - "vmul %[temp2], %[col2], %[vec] \n" - "vadday.x ACC, %[temp0], %[temp0] \n" - "vmaddaz.x ACC, %[ones], %[temp0] \n" - "vmaddw.x %[result], %[ones], %[col0] \n" - "vaddax.y ACC, %[temp1], %[temp1] \n" - "vmaddaz.y ACC, %[ones], %[temp1] \n" - "vmaddw.y %[result], %[ones], %[col1] \n" - "vaddax.z ACC, %[temp2], %[temp2] \n" - "vmadday.z ACC, %[ones], %[temp2] \n" - "vmaddw.z %[result], %[ones], %[col2] \n" - : [result] "=&j"(result), - [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [ones] "=&j"(ones), "=r"(vu0_ACC) - : [col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2), [vec] "j"(vec)); - + vec128_t result; + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t vec_val = vec.vec128; + asm __volatile__( + "### mat_43 trans_mult point_t ### \n" + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[vec_val], $vf4 \n" + "vmaxw $vf5, $vf0, $vf0 \n" + "vmul $vf6, $vf1, $vf4 \n" + "vmul $vf7, $vf2, $vf4 \n" + "vmul $vf8, $vf3, $vf4 \n" + "vadday.x $ACC, $vf6, $vf6 \n" + "vmaddaz.x $ACC, $vf5, $vf6 \n" + "vmaddw.x $vf9, $vf5, $vf1 \n" + "vaddax.y $ACC, $vf7, $vf7 \n" + "vmaddaz.y $ACC, $vf5, $vf7 \n" + "vmaddw.y $vf9, $vf5, $vf2 \n" + "vaddax.z $ACC, $vf8, $vf8 \n" + "vmadday.z $ACC, $vf5, $vf8 \n" + "vmaddw.z $vf9, $vf5, $vf3 \n" + "qmfc2 %[result], $vf9 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), + [col2_val] "r"(col2_val), [vec_val] "r"(vec_val) + : "memory"); return vec_3(result); } @@ -622,26 +656,37 @@ class mat_34 : public mat_x4_template { vec_4 trans_mult(vec_3 vec) const { - vec128_t result, temp0, temp1, temp2, temp3; - - asm("### mat_34 trans_mult vec_3 ### \n" - "vmul %[temp3], %[col3], %[vec] \n" - "vmul %[temp0], %[col0], %[vec] \n" - "vmul %[temp1], %[col1], %[vec] \n" - "vmul %[temp2], %[col2], %[vec] \n" - "vmulx.w %[temp3], vf00, %[temp3] \n" - "vaddy.x %[temp0], %[temp0], %[temp0] \n" - "vaddx.y %[temp1], %[temp1], %[temp1] \n" - "vaddx.z %[temp2], %[temp2], %[temp2] \n" - "vaddy.w %[temp3], %[temp3], %[temp3] \n" - "vaddz.x %[result], %[temp0], %[temp0] \n" - "vaddz.y %[result], %[temp1], %[temp1] \n" - "vaddy.z %[result], %[temp2], %[temp2] \n" - "vaddz.w %[result], %[temp3], %[temp3] \n" - : [result] "=&j"(result), - [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [temp3] "=&j"(temp3) - : [col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2), [col3] "j"(col3), [vec] "j"(vec)); - + vec128_t result; + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + vec128_t vec_val = vec.vec128; + asm __volatile__( + "### mat_34 trans_mult vec_3 ### \n" + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[vec_val], $vf5 \n" + "vmul $vf7, $vf4, $vf5 \n" + "vmul $vf6, $vf1, $vf5 \n" + "vmul $vf8, $vf2, $vf5 \n" + "vmul $vf9, $vf3, $vf5 \n" + "vmulx.w $vf7, $vf0, $vf7 \n" + "vaddy.x $vf6, $vf6, $vf6 \n" + "vaddx.y $vf8, $vf8, $vf8 \n" + "vaddx.z $vf9, $vf9, $vf9 \n" + "vaddy.w $vf7, $vf7, $vf7 \n" + "vaddz.x $vf10, $vf6, $vf6 \n" + "vaddz.y $vf10, $vf8, $vf8 \n" + "vaddy.z $vf10, $vf9, $vf9 \n" + "vaddz.w $vf10, $vf7, $vf7 \n" + "qmfc2 %[result], $vf10 \n" + : [result] "=&r"(result) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), + [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val) + : "memory"); return vec_4(result); } @@ -742,43 +787,91 @@ class mat_44 : public mat_x4_template { void set_identity() { - asm(" ### mat_44::set_identity ### \n" - "vsub %[col0], %[col0], %[col0] \n" - "vsub %[col1], %[col1], %[col1] \n" - "vmr32 %[col2], vf00 \n" - "vmove %[col3], vf00 \n" - "vaddw.x %[col0], vf00, vf00 \n" - "vaddw.y %[col1], vf00, vf00 \n" - : [col0] "=j"(col0), - [col1] "=j"(col1), - [col2] "=j"(col2), - [col3] "=j"(col3)); + asm __volatile__( + "vsub $vf1, $vf0, $vf0 \n" + "vsub $vf2, $vf0, $vf0 \n" + "vmr32 $vf3, $vf0 \n" + "vmove $vf4, $vf0 \n" + "vaddw.x $vf1, $vf1, $vf0 \n" + "vaddw.y $vf2, $vf2, $vf0 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + "qmfc2 %[col3], $vf4 \n" + : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128), [col3] "=r"(col3.vec128) + : + : "memory"); } void set_scale(vec_3 scale) { - set_identity(); - col0 = (vec_x)scale; - col1 = (vec_y)scale; - col2 = (vec_z)scale; + set_zero(); + // Extract components and set diagonal elements using VU0 + float sx = (float)vec_x(scale); + float sy = (float)vec_y(scale); + float sz = (float)vec_z(scale); + asm __volatile__( + "ctc2 %[sx], $vi21 \n" + "vaddi.x $vf1, $vf0, $I \n" + "ctc2 %[sy], $vi21 \n" + "vaddi.y $vf2, $vf0, $I \n" + "ctc2 %[sz], $vi21 \n" + "vaddi.z $vf3, $vf0, $I \n" + "vmove $vf4, $vf0 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + "qmfc2 %[col3], $vf4 \n" + : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128), [col3] "=r"(col3.vec128) + : [sx] "r"(sx), [sy] "r"(sy), [sz] "r"(sz) + : "memory"); } void set_scale(vec_4 scale) { set_identity(); - col0 = (vec_x)scale; - col1 = (vec_y)scale; - col2 = (vec_z)scale; - col3 = (vec_w)scale; + // Extract components and set diagonal elements using VU0 + float sx = (float)vec_x(scale); + float sy = (float)vec_y(scale); + float sz = (float)vec_z(scale); + float sw = (float)vec_w(scale); + asm __volatile__( + "vsub $vf1, $vf0, $vf0 \n" + "vsub $vf2, $vf0, $vf0 \n" + "vsub $vf3, $vf0, $vf0 \n" + "vsub $vf4, $vf0, $vf0 \n" + "ctc2 %[sx], $vi21 \n" + "vaddi.x $vf1, $vf0, $I \n" + "ctc2 %[sy], $vi21 \n" + "vaddi.y $vf2, $vf0, $I \n" + "ctc2 %[sz], $vi21 \n" + "vaddi.z $vf3, $vf0, $I \n" + "ctc2 %[sw], $vi21 \n" + "vmuli.w $vf4, $vf0, $I \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + "qmfc2 %[col3], $vf4 \n" + : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128), [col3] "=r"(col3.vec128) + : [sx] "r"(sx), [sy] "r"(sy), [sz] "r"(sz), [sw] "r"(sw) + : "memory"); } void set_translate(vec_3 xlate_amount) { set_identity(); - col3 = xlate_amount; + // Set translation in col3, preserving w=1 from set_identity + float tx = (float)vec_x(xlate_amount); + float ty = (float)vec_y(xlate_amount); + float tz = (float)vec_z(xlate_amount); + col3.set_x(tx); + col3.set_y(ty); + col3.set_z(tz); + // w should already be 1.0 from set_identity, but ensure it + col3.set_w(1.0f); } void @@ -882,95 +975,117 @@ class mat_44 : public mat_x4_template { vec_4 trans_mult(vec_4 vec) const { - vec128_t result, temp0, temp1, temp2, temp3; - - asm("### mat_44 trans_mult vec_4 ### \n" - "vmul %[temp0], %[col0], %[vec] \n" - "vmul %[temp1], %[col1], %[vec] \n" - "vmul %[temp2], %[col2], %[vec] \n" - "vmul %[temp3], %[col3], %[vec] \n" - "vaddy.x %[temp0], %[temp0], %[temp0] \n" - "vaddx.y %[temp1], %[temp1], %[temp1] \n" - "vaddx.z %[temp2], %[temp2], %[temp2] \n" - "vaddx.w %[temp3], %[temp3], %[temp3] \n" - "vaddz.x %[temp0], %[temp0], %[temp0] \n" - "vaddz.y %[temp1], %[temp1], %[temp1] \n" - "vaddy.z %[temp2], %[temp2], %[temp2] \n" - "vaddy.w %[temp3], %[temp3], %[temp3] \n" - "vaddw.x %[result], %[temp0], %[temp0] \n" - "vaddw.y %[result], %[temp1], %[temp1] \n" - "vaddw.z %[result], %[temp2], %[temp2] \n" - "vaddz.w %[result], %[temp3], %[temp3] \n" - : [result] "=&j"(result), - [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [temp3] "=&j"(temp3) - : [col0] "j"(col0), - [col1] "j"(col1), - [col2] "j"(col2), - [col3] "j"(col3), - [vec] "j"(vec)); - + vec128_t result; + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + vec128_t vec_val = vec.vec128; + asm __volatile__( + "### mat_44 trans_mult vec_4 ### \n" + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[vec_val], $vf5 \n" + "vmul $vf6, $vf1, $vf5 \n" + "vmul $vf7, $vf2, $vf5 \n" + "vmul $vf8, $vf3, $vf5 \n" + "vmul $vf9, $vf4, $vf5 \n" + "vaddy.x $vf6, $vf6, $vf6 \n" + "vaddx.y $vf7, $vf7, $vf7 \n" + "vaddx.z $vf8, $vf8, $vf8 \n" + "vaddx.w $vf9, $vf9, $vf9 \n" + "vaddz.x $vf6, $vf6, $vf6 \n" + "vaddz.y $vf7, $vf7, $vf7 \n" + "vaddy.z $vf8, $vf8, $vf8 \n" + "vaddy.w $vf9, $vf9, $vf9 \n" + "vaddw.x $vf10, $vf6, $vf6 \n" + "vaddw.y $vf10, $vf7, $vf7 \n" + "vaddw.z $vf10, $vf8, $vf8 \n" + "vaddz.w $vf10, $vf9, $vf9 \n" + "qmfc2 %[result], $vf10 \n" + : [result] "=&r"(result) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), + [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val) + : "memory"); return vec_4(result); } vec_4 trans_mult(vector_t vec) const { - vec128_t result, temp0, temp1, temp2, temp3; - - asm("### mat_44 trans_mult vector_t ### \n" - "vmul %[temp3], %[col3], %[vec] \n" - "vmul %[temp0], %[col0], %[vec] \n" - "vmul %[temp1], %[col1], %[vec] \n" - "vmul %[temp2], %[col2], %[vec] \n" - "vmulx.w %[temp3], vf00, %[temp3] \n" - "vaddy.x %[temp0], %[temp0], %[temp0] \n" - "vaddx.y %[temp1], %[temp1], %[temp1] \n" - "vaddx.z %[temp2], %[temp2], %[temp2] \n" - "vaddy.w %[temp3], %[temp3], %[temp3] \n" - "vaddz.x %[result], %[temp0], %[temp0] \n" - "vaddz.y %[result], %[temp1], %[temp1] \n" - "vaddy.z %[result], %[temp2], %[temp2] \n" - "vaddz.w %[result], %[temp3], %[temp3] \n" - : [result] "=&j"(result), - [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [temp3] "=&j"(temp3) - : [col0] "j"(col0), - [col1] "j"(col1), - [col2] "j"(col2), - [col3] "j"(col3), - [vec] "j"(vec)); - + vec128_t result; + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + vec128_t vec_val = vec.vec128; + asm __volatile__( + "### mat_44 trans_mult vector_t ### \n" + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[vec_val], $vf5 \n" + "vmul $vf9, $vf4, $vf5 \n" + "vmul $vf6, $vf1, $vf5 \n" + "vmul $vf7, $vf2, $vf5 \n" + "vmul $vf8, $vf3, $vf5 \n" + "vmulx.w $vf9, $vf0, $vf9 \n" + "vaddy.x $vf6, $vf6, $vf6 \n" + "vaddx.y $vf7, $vf7, $vf7 \n" + "vaddx.z $vf8, $vf8, $vf8 \n" + "vaddy.w $vf9, $vf9, $vf9 \n" + "vaddz.x $vf10, $vf6, $vf6 \n" + "vaddz.y $vf10, $vf7, $vf7 \n" + "vaddy.z $vf10, $vf8, $vf8 \n" + "vaddz.w $vf10, $vf9, $vf9 \n" + "qmfc2 %[result], $vf10 \n" + : [result] "=&r"(result) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), + [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val) + : "memory"); return vec_4(result); } vec_4 trans_mult(point_t vec) const { - vec128_t result, temp0, temp1, temp2, temp3; - - asm("### mat_44 trans_mult point_t ### \n" - "vmul %[temp0], %[col0], %[vec] \n" - "vmul %[temp1], %[col1], %[vec] \n" - "vmul %[temp2], %[col2], %[vec] \n" - "vmul %[temp3], %[col3], %[vec] \n" - "vaddy.x %[temp0], %[temp0], %[temp0] \n" - "vaddx.y %[temp1], %[temp1], %[temp1] \n" - "vaddx.z %[temp2], %[temp2], %[temp2] \n" - "vaddx.w %[temp3], %[col3], %[temp3] \n" - "vaddz.x %[temp0], %[temp0], %[temp0] \n" - "vaddz.y %[temp1], %[temp1], %[temp1] \n" - "vaddy.z %[temp2], %[temp2], %[temp2] \n" - "vaddy.w %[temp3], %[temp3], %[temp3] \n" - "vaddw.x %[result], %[temp0], %[col0] \n" - "vaddw.y %[result], %[temp1], %[col1] \n" - "vaddw.z %[result], %[temp2], %[col2] \n" - "vaddz.w %[result], %[temp3], %[temp3] \n" - : [result] "=&j"(result), - [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [temp3] "=&j"(temp3) - : [col0] "j"(col0), - [col1] "j"(col1), - [col2] "j"(col2), - [col3] "j"(col3), - [vec] "j"(vec)); + vec128_t result; + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + vec128_t vec_val = vec.vec128; + asm __volatile__( + "### mat_44 trans_mult point_t ### \n" + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[vec_val], $vf5 \n" + "vmul $vf6, $vf1, $vf5 \n" + "vmul $vf7, $vf2, $vf5 \n" + "vmul $vf8, $vf3, $vf5 \n" + "vmul $vf9, $vf4, $vf5 \n" + "vaddy.x $vf6, $vf6, $vf6 \n" + "vaddx.y $vf7, $vf7, $vf7 \n" + "vaddx.z $vf8, $vf8, $vf8 \n" + "vaddx.w $vf9, $vf4, $vf9 \n" + "vaddz.x $vf6, $vf6, $vf6 \n" + "vaddz.y $vf7, $vf7, $vf7 \n" + "vaddy.z $vf8, $vf8, $vf8 \n" + "vaddy.w $vf9, $vf9, $vf9 \n" + "vaddw.x $vf10, $vf6, $vf1 \n" + "vaddw.y $vf10, $vf7, $vf2 \n" + "vaddw.z $vf10, $vf8, $vf3 \n" + "vaddz.w $vf10, $vf9, $vf9 \n" + "qmfc2 %[result], $vf10 \n" + : [result] "=&r"(result) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), + [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val) + : "memory"); return vec_4(result); } @@ -1255,49 +1370,58 @@ class transform_t { vec_4 get_row0() const { + vec128_t c0 = col0.vec128, c1 = col1.vec128, c2 = col2.vec128, c3 = col3.vec128; vec128_t row; - asm(" ### transform_t::get_row0 ### \n" - "vaddx.x %[row], vf00, %[col0] \n" - "vaddx.y %[row], vf00, %[col1] \n" - "vaddx.z %[row], vf00, %[col2] \n" - "vmulx.w %[row], vf00, %[col3] \n" - : [row] "=&j"(row) - : [col0] "j"(col0), - [col1] "j"(col1), - [col2] "j"(col2), - [col3] "j"(col3)); + asm __volatile__ (" ### transform_t::get_row0 ### \n" + "qmtc2 %[c0], $vf1 \n" + "qmtc2 %[c1], $vf2 \n" + "qmtc2 %[c2], $vf3 \n" + "qmtc2 %[c3], $vf4 \n" + "vaddx.x $vf5, $vf0, $vf1 \n" + "vaddx.y $vf5, $vf0, $vf2 \n" + "vaddx.z $vf5, $vf0, $vf3 \n" + "vmulx.w $vf5, $vf0, $vf4 \n" + "qmfc2 %[row], $vf5 \n" + : [row] "=r"(row) + : [c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2), [c3] "r"(c3)); return vec_4(row); } vec_4 get_row1() const { + vec128_t c0 = col0.vec128, c1 = col1.vec128, c2 = col2.vec128, c3 = col3.vec128; vec128_t row; - asm(" ### transform_t::get_row1 ### \n" - "vaddy.x %[row], vf00, %[col0] \n" - "vaddy.y %[row], vf00, %[col1] \n" - "vaddy.z %[row], vf00, %[col2] \n" - "vmuly.w %[row], vf00, %[col3] \n" - : [row] "=&j"(row) - : [col0] "j"(col0), - [col1] "j"(col1), - [col2] "j"(col2), - [col3] "j"(col3)); + asm __volatile__ (" ### transform_t::get_row1 ### \n" + "qmtc2 %[c0], $vf1 \n" + "qmtc2 %[c1], $vf2 \n" + "qmtc2 %[c2], $vf3 \n" + "qmtc2 %[c3], $vf4 \n" + "vaddy.x $vf5, $vf0, $vf1 \n" + "vaddy.y $vf5, $vf0, $vf2 \n" + "vaddy.z $vf5, $vf0, $vf3 \n" + "vmuly.w $vf5, $vf0, $vf4 \n" + "qmfc2 %[row], $vf5 \n" + : [row] "=r"(row) + : [c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2), [c3] "r"(c3)); return vec_4(row); } vec_4 get_row2() const { vec128_t row; - asm(" ### transform_t::get_row2 ### \n" - "vaddz.x %[row], vf00, %[col0] \n" - "vaddz.y %[row], vf00, %[col1] \n" - "vaddz.z %[row], vf00, %[col2] \n" - "vmulz.w %[row], vf00, %[col3] \n" - : [row] "=&j"(row) - : [col0] "j"(col0), - [col1] "j"(col1), - [col2] "j"(col2), - [col3] "j"(col3)); + asm __volatile__ (" ### transform_t::get_row2 ### \n" + "lqc2 $vf1, 0(%[T]) \n" + "lqc2 $vf2, 16(%[T]) \n" + "lqc2 $vf3, 32(%[T]) \n" + "lqc2 $vf4, 48(%[T]) \n" + "vaddz.x $vf5, $vf0, $vf1 \n" + "vaddz.y $vf5, $vf0, $vf2 \n" + "vaddz.z $vf5, $vf0, $vf3 \n" + "vmulz.w $vf5, $vf0, $vf4 \n" + "qmfc2 %[row], $vf5 \n" + : [row] "=r"(row) + : [T] "r"(this) + : "memory"); return vec_4(row); } @@ -1327,26 +1451,31 @@ class transform_t { void orthonormal_inverse_in_place() { - vec128_t temp; - asm( + asm __volatile__ ( "### transform_t::orthonormal_inverse_in_place ### \n" - "vadd.xz %[temp], vf00, %[col1] \n" - "vaddx.y %[temp], vf00, %[col2] \n" - "vaddy.x %[col1], vf00, %[col0] \n" - "vaddy.z %[col1], vf00, %[col2] \n" - "vaddz.x %[col2], vf00, %[col0] \n" - "vaddy.z %[col0], vf00, %[temp] \n" - "vaddx.y %[col0], vf00, %[temp] \n" - "vaddz.y %[col2], vf00, %[temp] \n" - "vadda ACC, vf00, vf00 \n" - "vmsubay ACC, %[col1], %[col3] \n" - "vmsubax ACC, %[col0], %[col3] \n" - "vmsubz %[col3], %[col2], %[col3] \n" - : [col0] "+j"(col0), - [col1] "+j"(col1), - [col2] "+j"(col2), - [col3] "+j"(col3), - [temp] "=j"(temp), "=r"(vu0_ACC)); + "lqc2 $vf1, 0(%[T]) \n" + "lqc2 $vf2, 16(%[T]) \n" + "lqc2 $vf3, 32(%[T]) \n" + "lqc2 $vf4, 48(%[T]) \n" + "vadd.xz $vf5, $vf0, $vf2 \n" + "vaddx.y $vf5, $vf0, $vf3 \n" + "vaddy.x $vf2, $vf0, $vf1 \n" + "vaddy.z $vf2, $vf0, $vf3 \n" + "vaddz.x $vf3, $vf0, $vf1 \n" + "vaddy.z $vf1, $vf0, $vf5 \n" + "vaddx.y $vf1, $vf0, $vf5 \n" + "vaddz.y $vf3, $vf0, $vf5 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmsubay $ACC, $vf2, $vf4 \n" + "vmsubax $ACC, $vf1, $vf4 \n" + "vmsubz $vf4, $vf3, $vf4 \n" + "sqc2 $vf1, 0(%[T]) \n" + "sqc2 $vf2, 16(%[T]) \n" + "sqc2 $vf3, 32(%[T]) \n" + "sqc2 $vf4, 48(%[T]) \n" + : "=r"(vu0_ACC) + : [T] "r"(this) + : "memory"); } // matrix/vector operations @@ -1354,50 +1483,70 @@ class transform_t { vec_4 operator*(vec_4 vec) const { + vec128_t v = vec.vec128; vec128_t result; - asm( + asm __volatile__ ( " ### transform_t * vec_4 ### \n" - "vmulax ACC, %[col0], %[vec] \n" - "vmadday ACC, %[col1], %[vec] \n" - "vmaddaz ACC, %[col2], %[vec] \n" - "vmaddw.xyz %[result], %[col3], %[vec] \n" - "vmove.w %[result], %[vec] \n" - : [result] "=&j"(result), "=r"(vu0_ACC) - : [vec] "j"(vec), - [col0] "j"(col0), [col1] "j"(col1), - [col2] "j"(col2), [col3] "j"(col3)); + "lqc2 $vf1, 0(%[T]) \n" + "lqc2 $vf2, 16(%[T]) \n" + "lqc2 $vf3, 32(%[T]) \n" + "lqc2 $vf4, 48(%[T]) \n" + "qmtc2 %[v], $vf5 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulax $ACC, $vf1, $vf5 \n" + "vmadday $ACC, $vf2, $vf5 \n" + "vmaddaz $ACC, $vf3, $vf5 \n" + "vmaddw $vf6, $vf4, $vf5 \n" + "qmfc2 %[result], $vf6 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [T] "r"(this), [v] "r"(v) + : "memory"); return vec_4(result); } vector_t operator*(vector_t vec) const { + vec128_t v = vec.vec128; vec128_t result; - asm( + asm __volatile__ ( " ### transform_t * vector_t ### \n" - "vmulax ACC, %[col0], %[vec] \n" - "vmadday ACC, %[col1], %[vec] \n" - "vmaddz %[result], %[col2, %[vec] \n" - : [result] "=&j"(result), "=r"(vu0_ACC) - : [vec] "j"(vec), - [col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2)); + "lqc2 $vf1, 0(%[T]) \n" + "lqc2 $vf2, 16(%[T]) \n" + "lqc2 $vf3, 32(%[T]) \n" + "qmtc2 %[v], $vf4 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulax $ACC, $vf1, $vf4 \n" + "vmadday $ACC, $vf2, $vf4 \n" + "vmaddz $vf5, $vf3, $vf4 \n" + "qmfc2 %[result], $vf5 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [T] "r"(this), [v] "r"(v) + : "memory"); return vector_t(result); } point_t operator*(point_t pt) const { + vec128_t p = pt.vec128; vec128_t result; - asm( + asm __volatile__ ( " ### transform_t * point_t ### \n" - "vmulax ACC, %[col0], %[pt] \n" - "vmadday ACC, %[col1], %[pt] \n" - "vmaddaz ACC, %[col2], %[pt] \n" - "vmaddw %[result], %[col3], vf00 \n" - : [result] "=&j"(result), "=r"(vu0_ACC) - : [pt] "j"(pt), - [col0] "j"(col0), [col1] "j"(col1), - [col2] "j"(col2), [col3] "j"(col3)); + "lqc2 $vf1, 0(%[T]) \n" + "lqc2 $vf2, 16(%[T]) \n" + "lqc2 $vf3, 32(%[T]) \n" + "lqc2 $vf4, 48(%[T]) \n" + "qmtc2 %[p], $vf5 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulax $ACC, $vf1, $vf5 \n" + "vmadday $ACC, $vf2, $vf5 \n" + "vmaddaz $ACC, $vf3, $vf5 \n" + "vmaddw $vf6, $vf4, $vf0 \n" + "qmfc2 %[result], $vf6 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [T] "r"(this), [p] "r"(p) + : "memory"); return point_t(result); } @@ -1440,22 +1589,25 @@ inline mat_33 vec_xyz::operator~() const { mat_33 result; - asm("### make tilde matrix from vec_xyz ### \n" - "vsub %[res0], %[res0], %[res0] \n" - "vsub %[res1], %[res1], %[res1] \n" - "vmr32 %[res2], vf00 \n" - "vaddw.x %[res0], vf00, vf00 \n" - "vaddw.y %[res1], vf00, vf00 \n" - "vopmula ACC, %[vec], %[res0] \n" - "vopmsub %[res0], %[res0], %[vec] \n" - "vopmula ACC, %[vec], %[res1] \n" - "vopmsub %[res1], %[res1], %[vec] \n" - "vopmula ACC, %[vec], %[res2] \n" - "vopmsub %[res2], %[res2], %[vec] \n" - : [res0] "=&j"(result.col0), - [res1] "=&j"(result.col1), - [res2] "=&j"(result.col2), "=r"(vu0_ACC) - : [vec] "j"(vec128)); + vec128_t v = vec128; + asm __volatile__ ("### make tilde matrix from vec_xyz ### \n" + "qmtc2 %[v], $vf4 \n" + "vsub $vf1, $vf0, $vf0 \n" + "vsub $vf2, $vf0, $vf0 \n" + "vmr32 $vf3, $vf0 \n" + "vaddw.x $vf1, $vf0, $vf0 \n" + "vaddw.y $vf2, $vf0, $vf0 \n" + "vopmula $ACC, $vf4, $vf1 \n" + "vopmsub $vf1, $vf1, $vf4 \n" + "vopmula $ACC, $vf4, $vf2 \n" + "vopmsub $vf2, $vf2, $vf4 \n" + "vopmula $ACC, $vf4, $vf3 \n" + "vopmsub $vf3, $vf3, $vf4 \n" + "qmfc2 %[r0], $vf1 \n" + "qmfc2 %[r1], $vf2 \n" + "qmfc2 %[r2], $vf3 \n" + : [r0] "=&r"(result.col0), [r1] "=&r"(result.col1), [r2] "=&r"(result.col2), "=r"(vu0_ACC) + : [v] "r"(v)); return result; } @@ -1483,68 +1635,70 @@ vec_xyz::tilde_mult(const mat_34& mat) const inline vec_xyz vec_xyz::operator*(const mat_33& mat) const { - vec128_t ones, temp0, temp1, temp2, result; - - asm("### vec_xyz (row) * mat_33 ## \n" - "vmul %[temp0], %[vec], %[col0] \n" - "vmaxw %[ones], vf00, vf00 \n" - "vmul %[temp1], %[vec], %[col1] \n" - "vmul %[temp2], %[vec], %[col2] \n" - "vadday.x ACC, %[temp0], %[temp0] \n" - "vmaddz.x %[res], %[ones], %[temp0] \n" - "vaddax.y ACC, %[temp1], %[temp1] \n" - "vmaddz.y %[res], %[ones], %[temp1] \n" - "vaddax.z ACC, %[temp2], %[temp2] \n" - "vmaddy.z %[res], %[ones], %[temp2] \n" - : [res] "=&j"(result), [ones] "=&j"(ones), - [temp0] "=&j"(temp0), - [temp1] "=&j"(temp1), - [temp2] "=&j"(temp2), "=r"(vu0_ACC) - : [col0] "j"(mat.col0), - [col1] "j"(mat.col1), - [col2] "j"(mat.col2), - [vec] "j"(vec128)); + vec128_t result; + vec128_t col0_val = mat.col0.vec128; + vec128_t col1_val = mat.col1.vec128; + vec128_t col2_val = mat.col2.vec128; + vec128_t vec_val = vec128; + asm __volatile__( + "### vec_xyz (row) * mat_33 ### \n" + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[vec_val], $vf4 \n" + "vmaxw $vf5, $vf0, $vf0 \n" + "vmul $vf6, $vf4, $vf1 \n" + "vmul $vf7, $vf4, $vf2 \n" + "vmul $vf8, $vf4, $vf3 \n" + "vadday.x $ACC, $vf6, $vf6 \n" + "vmaddz.x $vf9, $vf5, $vf6 \n" + "vaddax.y $ACC, $vf7, $vf7 \n" + "vmaddz.y $vf9, $vf5, $vf7 \n" + "vaddax.z $ACC, $vf8, $vf8 \n" + "vmaddy.z $vf9, $vf5, $vf8 \n" + "qmfc2 %[result], $vf9 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), + [col2_val] "r"(col2_val), [vec_val] "r"(vec_val) + : "memory"); return vec_xyz(result); } inline vec_xyzw vec_xyz::operator*(const mat_34& mat) const { - vec128_t ones, temp0, temp1, temp2, temp3, result; - - asm("### vec_xyz (row) * mat_34 ## \n" - "vmul %[temp0], %[vec], %[col0] \n" - "vmaxw %[ones], vf00, vf00 \n" - "vmul %[temp1], %[vec], %[col1] \n" - "vmul %[temp2], %[vec], %[col2] \n" - "vmul %[temp3], %[vec], %[col3] \n" - : [ones] "=&j"(ones), - [temp0] "=&j"(temp0), - [temp1] "=&j"(temp1), - [temp2] "=&j"(temp2), - [temp3] "=&j"(temp3) - : [col0] "j"(mat.col0), - [col1] "j"(mat.col1), - [col2] "j"(mat.col2), - [col3] "j"(mat.col3), - [vec] "j"(vec128)); - - asm("vmulx.w %[temp3], vf00, %[temp3] \n" - "vadday.x ACC, %[temp0], %[temp0] \n" - "vmaddz.x %[res], %[ones], %[temp0] \n" - "vaddax.y ACC, %[temp1], %[temp1] \n" - "vmaddz.y %[res], %[ones], %[temp1] \n" - "vaddax.z ACC, %[temp2], %[temp2] \n" - "vmaddy.z %[res], %[ones], %[temp2] \n" - "vadday.w ACC, %[temp3], %[temp3] \n" - "vmaddz.w %[res], %[ones], %[temp3] \n" - : [res] "=&j"(result), "=r"(vu0_ACC) - : [ones] "j"(ones), - [temp0] "j"(temp0), - [temp1] "j"(temp1), - [temp2] "j"(temp2), - [temp3] "j"(temp3)); - + vec128_t result; + vec128_t col0_val = mat.col0.vec128; + vec128_t col1_val = mat.col1.vec128; + vec128_t col2_val = mat.col2.vec128; + vec128_t col3_val = mat.col3.vec128; + vec128_t vec_val = vec128; + asm __volatile__( + "### vec_xyz (row) * mat_34 ### \n" + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[vec_val], $vf5 \n" + "vmaxw $vf6, $vf0, $vf0 \n" + "vmul $vf7, $vf5, $vf1 \n" + "vmul $vf8, $vf5, $vf2 \n" + "vmul $vf9, $vf5, $vf3 \n" + "vmul $vf10, $vf5, $vf4 \n" + "vmulx.w $vf10, $vf0, $vf10 \n" + "vadday.x $ACC, $vf7, $vf7 \n" + "vmaddz.x $vf11, $vf6, $vf7 \n" + "vaddax.y $ACC, $vf8, $vf8 \n" + "vmaddz.y $vf11, $vf6, $vf8 \n" + "vaddax.z $ACC, $vf9, $vf9 \n" + "vmaddy.z $vf11, $vf6, $vf9 \n" + "vadday.w $ACC, $vf10, $vf10 \n" + "vmaddz.w $vf11, $vf6, $vf10 \n" + "qmfc2 %[result], $vf11 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), + [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val) + : "memory"); return vec_xyzw(result); } @@ -1573,75 +1727,76 @@ vec_xyz::tensor_mult(const vec_4 vec) const inline vec_xyz vec_xyzw::operator*(const mat_43& mat) const { - vec128_t ones, temp0, temp1, temp2, result; - - asm("### vec_xyzw (row) * mat_43 ## \n" - "vmul %[temp0], %[vec], %[col0] \n" - "vmaxw %[ones], vf00, vf00 \n" - "vmul %[temp1], %[vec], %[col1] \n" - "vmul %[temp2], %[vec], %[col2] \n" - "vadday.x ACC, %[temp0], %[temp0] \n" - "vmaddaz.x ACC, %[ones], %[temp0] \n" - "vmaddw.x %[res], %[ones], %[temp0] \n" - "vaddax.y ACC, %[temp1], %[temp1] \n" - "vmaddaz.y ACC, %[ones], %[temp1] \n" - "vmaddw.y %[res], %[ones], %[temp1] \n" - "vaddax.z ACC, %[temp2], %[temp2] \n" - "vmadday.z ACC, %[ones], %[temp2] \n" - "vmaddw.z %[res], %[ones], %[temp2] \n" - : [res] "=&j"(result), [ones] "=&j"(ones), - [temp0] "=&j"(temp0), - [temp1] "=&j"(temp1), - [temp2] "=&j"(temp2), "=r"(vu0_ACC) - : [col0] "j"(mat.col0), - [col1] "j"(mat.col1), - [col2] "j"(mat.col2), - [vec] "j"(vec128)); - + vec128_t result; + vec128_t col0_val = mat.col0.vec128; + vec128_t col1_val = mat.col1.vec128; + vec128_t col2_val = mat.col2.vec128; + vec128_t vec_val = vec128; + asm __volatile__( + "### vec_xyzw (row) * mat_43 ### \n" + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[vec_val], $vf4 \n" + "vmaxw $vf5, $vf0, $vf0 \n" + "vmul $vf6, $vf4, $vf1 \n" + "vmul $vf7, $vf4, $vf2 \n" + "vmul $vf8, $vf4, $vf3 \n" + "vadday.x $ACC, $vf6, $vf6 \n" + "vmaddaz.x $ACC, $vf5, $vf6 \n" + "vmaddw.x $vf9, $vf5, $vf6 \n" + "vaddax.y $ACC, $vf7, $vf7 \n" + "vmaddaz.y $ACC, $vf5, $vf7 \n" + "vmaddw.y $vf9, $vf5, $vf7 \n" + "vaddax.z $ACC, $vf8, $vf8 \n" + "vmadday.z $ACC, $vf5, $vf8 \n" + "vmaddw.z $vf9, $vf5, $vf8 \n" + "qmfc2 %[result], $vf9 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), + [col2_val] "r"(col2_val), [vec_val] "r"(vec_val) + : "memory"); return vec_xyz(result); } inline vec_xyzw vec_xyzw::operator*(const mat_44& mat) const { - vec128_t ones, temp0, temp1, temp2, temp3, result; - - asm("### vec_xyzw (row) * mat_44 ## \n" - "vmaxw %[ones], vf00, vf00 \n" - "vmul %[temp0], %[vec], %[col0] \n" - "vmul %[temp1], %[vec], %[col1] \n" - "vmul %[temp2], %[vec], %[col2] \n" - "vmul %[temp3], %[vec], %[col3] \n" - : [ones] "=&j"(ones), - [temp0] "=&j"(temp0), - [temp1] "=&j"(temp1), - [temp2] "=&j"(temp2), - [temp3] "=&j"(temp3) - : [col0] "j"(mat.col0), - [col1] "j"(mat.col1), - [col2] "j"(mat.col2), - [col3] "j"(mat.col3), - [vec] "j"(vec128)); - - asm("vadday.x ACC, %[temp0], %[temp0] \n" - "vmaddaz.x ACC, %[ones], %[temp0] \n" - "vmaddw.x %[res], %[ones], %[temp0] \n" - "vaddax.y ACC, %[temp1], %[temp1] \n" - "vmaddaz.y ACC, %[ones], %[temp1] \n" - "vmaddw.y %[res], %[ones], %[temp1] \n" - "vaddax.z ACC, %[temp2], %[temp2] \n" - "vmadday.z ACC, %[ones], %[temp2] \n" - "vmaddw.z %[res], %[ones], %[temp2] \n" - "vaddax.w ACC, %[temp3], %[temp3] \n" - "vmadday.w ACC, %[ones], %[temp3] \n" - "vmaddz.w %[res], %[ones], %[temp3] \n" - : [res] "=&j"(result), "=r"(vu0_ACC) - : [ones] "j"(ones), - [temp0] "j"(temp0), - [temp1] "j"(temp1), - [temp2] "j"(temp2), - [temp3] "j"(temp3)); - + vec128_t result; + vec128_t col0_val = mat.col0.vec128; + vec128_t col1_val = mat.col1.vec128; + vec128_t col2_val = mat.col2.vec128; + vec128_t col3_val = mat.col3.vec128; + vec128_t vec_val = vec128; + asm __volatile__( + "### vec_xyzw (row) * mat_44 ### \n" + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[vec_val], $vf5 \n" + "vmaxw $vf6, $vf0, $vf0 \n" + "vmul $vf7, $vf5, $vf1 \n" + "vmul $vf8, $vf5, $vf2 \n" + "vmul $vf9, $vf5, $vf3 \n" + "vmul $vf10, $vf5, $vf4 \n" + "vadday.x $ACC, $vf7, $vf7 \n" + "vmaddaz.x $ACC, $vf6, $vf7 \n" + "vmaddw.x $vf11, $vf6, $vf7 \n" + "vaddax.y $ACC, $vf8, $vf8 \n" + "vmaddaz.y $ACC, $vf6, $vf8 \n" + "vmaddw.y $vf11, $vf6, $vf8 \n" + "vaddax.z $ACC, $vf9, $vf9 \n" + "vmadday.z $ACC, $vf6, $vf9 \n" + "vmaddw.z $vf11, $vf6, $vf9 \n" + "vaddax.w $ACC, $vf10, $vf10 \n" + "vmadday.w $ACC, $vf6, $vf10 \n" + "vmaddz.w $vf11, $vf6, $vf10 \n" + "qmfc2 %[result], $vf11 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), + [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val) + : "memory"); return vec_xyzw(result); } @@ -1742,43 +1897,40 @@ inline mat_33 mat_33::inverse() const { mat_33 result; - vec128_t temp, determinant; - - asm("### mat_33::inverse ### \n" - - "vopmula.xyz ACC, %[col0], %[col1] # inv2 = crossproduct(col0, col1) \n" - "vopmsub.xyz %[inv2], %[col1], %[col0] \n" - "vopmula.xyz ACC, %[col1], %[col2] # inv0 = crossproduct(col1, col2) \n" - "vopmsub.xyz %[inv0], %[col2], %[col1] \n" - " # stall \n" - "vmul %[determinant], %[col2], %[inv2] # determinant(R) = dotproduct(col2, inv2) \n" - "vaddw.x %[temp], vf00, vf00 \n" - "vopmula.xyz ACC, %[col2], %[col0] # inv1 = crossproduct(col2, col0) \n" - "vopmsub.xyz %[inv1], %[col0], %[col2] \n" - "vadday.x ACCx, %[determinant], %[determinant] \n" - "vmaddz.x %[determinant], %[temp], %[determinant] \n" - - "vaddx.y %[temp], vf00], %[inv2] # Do an in-place transpose, produces determinant(R)*Rinv \n" - "vadd.xz %[temp], vf00], %[inv1] \n" - "vaddy.x %[inv1], vf00], %[inv0] \n" - "vdiv Q, vf00w, %[determinantx] # Q = 1/determinant(R) \n" - "vaddy.z %[inv1], vf00, %[inv2] \n" - "vaddz.x %[inv2], vf00, %[inv0] \n" - "vaddy.z %[inv0], vf00, %[temp] \n" - "vaddx.y %[inv0], vf00, %[temp] \n" - "vaddz.y %[inv2], vf00, %[temp] \n" - "vwaitq \n" - "vmulq.xyz %[inv1], %[inv1], Q \n" - "vmulq.xyz %[inv0], %[inv0], Q # Multiply by 1/determinant(R) \n" - "vmulq.xyz %[inv2], %[inv2], Q \n" - - : [inv0] "=&j"(result.col0), - [inv1] "=&j"(result.col1), - [inv2] "=&j"(result.col2), - [temp] "=&j"(temp), [determinant] "=&j"(determinant), "=r"(vu0_ACC) - : [col0] "j"(col0), - [col1] "j"(col1), - [col2] "j"(col2)); + vec128_t c0 = col0.vec128, c1 = col1.vec128, c2 = col2.vec128; + + asm __volatile__ ("### mat_33::inverse ### \n" + "qmtc2 %[c0], $vf1 \n" + "qmtc2 %[c1], $vf2 \n" + "qmtc2 %[c2], $vf3 \n" + "vopmula.xyz $ACC, $vf1, $vf2 # inv2 = col0 x col1 \n" + "vopmsub.xyz $vf6, $vf2, $vf1 \n" + "vopmula.xyz $ACC, $vf2, $vf3 # inv0 = col1 x col2 \n" + "vopmsub.xyz $vf4, $vf3, $vf2 \n" + "vmul $vf9, $vf3, $vf6 # det components \n" + "vaddw.x $vf8, $vf0, $vf0 # temp.x = 1.0 \n" + "vopmula.xyz $ACC, $vf3, $vf1 # inv1 = col2 x col0 \n" + "vopmsub.xyz $vf5, $vf1, $vf3 \n" + "vadday.x $ACC, $vf9, $vf9 \n" + "vmaddz.x $vf9, $vf8, $vf9 # det = det.x+det.y+det.z \n" + "vaddx.y $vf8, $vf0, $vf6 # transpose \n" + "vadd.xz $vf8, $vf0, $vf5 \n" + "vaddy.x $vf5, $vf0, $vf4 \n" + "vdiv $Q, $vf0w, $vf9x # Q = 1/det \n" + "vaddy.z $vf5, $vf0, $vf6 \n" + "vaddz.x $vf6, $vf0, $vf4 \n" + "vaddy.z $vf4, $vf0, $vf8 \n" + "vaddx.y $vf4, $vf0, $vf8 \n" + "vaddz.y $vf6, $vf0, $vf8 \n" + "vwaitq \n" + "vmulq.xyz $vf5, $vf5, $Q \n" + "vmulq.xyz $vf4, $vf4, $Q # scale by 1/det \n" + "vmulq.xyz $vf6, $vf6, $Q \n" + "qmfc2 %[r0], $vf4 \n" + "qmfc2 %[r1], $vf5 \n" + "qmfc2 %[r2], $vf6 \n" + : [r0] "=&r"(result.col0), [r1] "=&r"(result.col1), [r2] "=&r"(result.col2), "=r"(vu0_ACC) + : [c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2)); return result; } @@ -1842,15 +1994,23 @@ inline mat_33 mat_33::mult_tilde(vec_3 vec) const { mat_33 result; - asm("### mat_33 mult_tilde vec_3 ### \n" - "vmulaz ACC, %[col1], %[vec] \n" - "vmsuby %[res0], %[col2, %[vec] \n" - "vmulax ACC, %[col2, %[vec] \n" - "vmsubz %[res1], %[col0], %[vec] \n" - "vmulay ACC, %[col0, %[vec] \n" - "vmsubx %[res2], %[col1], %[vec] \n" - : [res0] "=&j"(result.col0), [res1] "=&j"(result.col1), [res2] "=&j"(result.col2), "=r"(vu0_ACC) - : [col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2), [vec] "j"(vec)); + vec128_t c0 = col0.vec128, c1 = col1.vec128, c2 = col2.vec128, v = vec.vec128; + asm __volatile__ ("### mat_33 mult_tilde vec_3 ### \n" + "qmtc2 %[c0], $vf1 \n" + "qmtc2 %[c1], $vf2 \n" + "qmtc2 %[c2], $vf3 \n" + "qmtc2 %[v], $vf4 \n" + "vmulaz $ACC, $vf2, $vf4 \n" + "vmsuby $vf5, $vf3, $vf4 \n" + "vmulax $ACC, $vf3, $vf4 \n" + "vmsubz $vf6, $vf1, $vf4 \n" + "vmulay $ACC, $vf1, $vf4 \n" + "vmsubx $vf7, $vf2, $vf4 \n" + "qmfc2 %[r0], $vf5 \n" + "qmfc2 %[r1], $vf6 \n" + "qmfc2 %[r2], $vf7 \n" + : [r0] "=&r"(result.col0), [r1] "=&r"(result.col1), [r2] "=&r"(result.col2), "=r"(vu0_ACC) + : [c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2), [v] "r"(v)); return result; } @@ -2287,15 +2447,23 @@ inline mat_43 mat_43::mult_tilde(vec_3 vec) const { mat_43 result; - asm("### mat_43 mult_tilde vec_3 ### \n" - "vmulaz ACC, %[col1], %[vec] \n" - "vmsuby %[res0], %[col2], %[vec] \n" - "vmulax ACC, %[col2], %[vec] \n" - "vmsubz %[res1], %[col0], %[vec] \n" - "vmulay ACC, %[col0], %[vec] \n" - "vmsubx %[res2], %[col1], %[vec] \n" - : [res0] "=&j"(result.col0), [res1] "=&j"(result.col1), [res2] "=&j"(result.col2), "=r"(vu0_ACC) - : [col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2), [vec] "j"(vec)); + vec128_t c0 = col0.vec128, c1 = col1.vec128, c2 = col2.vec128, v = vec.vec128; + asm __volatile__ ("### mat_43 mult_tilde vec_3 ### \n" + "qmtc2 %[c0], $vf1 \n" + "qmtc2 %[c1], $vf2 \n" + "qmtc2 %[c2], $vf3 \n" + "qmtc2 %[v], $vf4 \n" + "vmulaz $ACC, $vf2, $vf4 \n" + "vmsuby $vf5, $vf3, $vf4 \n" + "vmulax $ACC, $vf3, $vf4 \n" + "vmsubz $vf6, $vf1, $vf4 \n" + "vmulay $ACC, $vf1, $vf4 \n" + "vmsubx $vf7, $vf2, $vf4 \n" + "qmfc2 %[r0], $vf5 \n" + "qmfc2 %[r1], $vf6 \n" + "qmfc2 %[r2], $vf7 \n" + : [r0] "=&r"(result.col0), [r1] "=&r"(result.col1), [r2] "=&r"(result.col2), "=r"(vu0_ACC) + : [c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2), [v] "r"(v)); return result; } @@ -2620,56 +2788,47 @@ inline transform_t transform_t::inverse() const { transform_t result; - vec128_t temp, determinant; - - asm("### transform_t::inverse, part I ### \n" - "vopmula.xyz ACC, %[col0], %[col1] # inv2 = crossproduct(col0, col1) \n" - "vopmsub.xyz %[inv2], %[col1], %[col0] \n" - "vopmula.xyz ACC, %[col1], %[col2] # inv0 = crossproduct(col1, col2) \n" - "vopmsub.xyz %[inv0], %[col2], %[col1] \n" - " # stall \n" - "vmul %[determinant], %[col2], %[inv2] # determinant(R) = dotproduct(col2, inv2) \n" - "vaddw.x %[temp], vf00, vf00 \n" - "vopmula.xyz ACC, %[col2], %[col0] # inv1 = crossproduct(col2, col0) \n" - "vopmsub.xyz %[inv1], %[col0], %[col2] \n" - "vadday.x ACC, %[determinant], %[determinant] \n" - "vmaddz.x %[determinant], %[temp], %[determinant] \n" - - "vaddx.y %[temp], vf00, %[inv2] # Do an in-place transpose, produces determinant(R)*Rinv \n" - "vadd.xz %[temp], vf00, %[inv1] \n" - "vaddy.x %[inv1], vf00, %[inv0] \n" - "vdiv Q, vf00w, %[determinant]x # Q = 1/determinant(R) \n" - "vaddy.z %[inv1], vf00, %[inv2] \n" - "vaddz.x %[inv2], vf00, %[inv0] \n" - "vaddy.z %[inv0], vf00, %[temp] \n" - "vaddx.y %[inv0], vf00, %[temp] \n" - "vaddz.y %[inv2], vf00, %[temp] \n" - : [inv0] "=&j"(result.col0), - [inv1] "=&j"(result.col1), - [inv2] "=&j"(result.col2), - [temp] "=&j"(temp), - [determinant] "=&j"(determinant), "=r"(vu0_ACC), "=j"(vu0_Q) - : [col0] "j"(col0), - [col1] "j"(col1), - [col2] "j"(col2)); - - asm("### transform_t::inverse, part II ### \n" - - "vadda ACC, vf00, vf00 # Compute determinant(R)*Rinv -T \n" - "vmsubay ACC, %[inv1], %[col3] \n" - "vmsubax ACC, %[inv0], %[col3] \n" - "vmsubz %[inv3], %[inv2], %[col3] \n" - - "vmulq.xyz %[inv0], %[inv0], Q # Multiply by 1/determinant(R) \n" - "vmulq.xyz %[inv1], %[inv1], Q \n" - "vmulq.xyz %[inv2], %[inv2], Q \n" - "vmulq.xyz %[inv3], %[inv3], Q \n" - : [inv0] "+j"(result.col0), - [inv1] "+j"(result.col1), - [inv2] "+j"(result.col2), - [inv3] "=&j"(result.col3), - "=r"(vu0_ACC) - : [col3] "j"(col3), "j"(vu0_Q)); + + asm __volatile__ ("### transform_t::inverse ### \n" + "lqc2 $vf1, 0(%[T]) \n" + "lqc2 $vf2, 16(%[T]) \n" + "lqc2 $vf3, 32(%[T]) \n" + "lqc2 $vf4, 48(%[T]) \n" + "vopmula.xyz $ACC, $vf1, $vf2 # inv2 = col0 x col1 \n" + "vopmsub.xyz $vf7, $vf2, $vf1 \n" + "vopmula.xyz $ACC, $vf2, $vf3 # inv0 = col1 x col2 \n" + "vopmsub.xyz $vf5, $vf3, $vf2 \n" + "vmul $vf9, $vf3, $vf7 # det components \n" + "vaddw.x $vf8, $vf0, $vf0 # temp.x = 1.0 \n" + "vopmula.xyz $ACC, $vf3, $vf1 # inv1 = col2 x col0 \n" + "vopmsub.xyz $vf6, $vf1, $vf3 \n" + "vadday.x $ACC, $vf9, $vf9 \n" + "vmaddz.x $vf9, $vf8, $vf9 # det = det.x+det.y+det.z \n" + "vaddx.y $vf8, $vf0, $vf7 # transpose \n" + "vadd.xz $vf8, $vf0, $vf6 \n" + "vaddy.x $vf6, $vf0, $vf5 \n" + "vdiv $Q, $vf0w, $vf9x # Q = 1/det \n" + "vaddy.z $vf6, $vf0, $vf7 \n" + "vaddz.x $vf7, $vf0, $vf5 \n" + "vaddy.z $vf5, $vf0, $vf8 \n" + "vaddx.y $vf5, $vf0, $vf8 \n" + "vaddz.y $vf7, $vf0, $vf8 \n" + "vsuba $ACC, $vf0, $vf0 # compute -R^T*t \n" + "vmsubay $ACC, $vf6, $vf4 \n" + "vmsubax $ACC, $vf5, $vf4 \n" + "vmsubz $vf9, $vf7, $vf4 \n" + "vwaitq \n" + "vmulq.xyz $vf5, $vf5, $Q # scale by 1/det \n" + "vmulq.xyz $vf6, $vf6, $Q \n" + "vmulq.xyz $vf7, $vf7, $Q \n" + "vmulq.xyz $vf9, $vf9, $Q \n" + "sqc2 $vf5, 0(%[R]) \n" + "sqc2 $vf6, 16(%[R]) \n" + "sqc2 $vf7, 32(%[R]) \n" + "sqc2 $vf9, 48(%[R]) \n" + : "=r"(vu0_ACC) + : [T] "r"(this), [R] "r"(&result) + : "memory"); return result; } @@ -2677,28 +2836,31 @@ inline transform_t transform_t::orthonormal_inverse() const { transform_t result; - asm("### transform_t::orthonormal_inverse ### \n" - "vadd.x %[inv0], vf00, %[col0] \n" - "vadd.y %[inv1], vf00, %[col1] \n" - "vadd.z %[inv2], vf00, %[col2] \n" - "vaddx.y %[inv0], vf00, %[col1] \n" - "vaddy.x %[inv1], vf00, %[col0] \n" - "vaddz.x %[inv2], vf00, %[col0] \n" - "vaddx.z %[inv0], vf00, %[col2] \n" - "vaddy.z %[inv1], vf00, %[col2] \n" - "vaddz.y %[inv2], vf00, %[col1] \n" - "vadda ACC, vf00, vf00 \n" - "vmsubax ACC, %[inv0], %[col3] \n" - "vmsubay ACC, %[inv1], %[col3] \n" - "vmsubz %[inv3], %[inv2], %[col3] \n" - : [inv0] "=&j"(result.col0), - [inv1] "=&j"(result.col1), - [inv2] "=&j"(result.col2), - [inv3] "=&j"(result.col3), "=r"(vu0_ACC) - : [col0] "j"(col0), - [col1] "j"(col1), - [col2] "j"(col2), - [col3] "j"(col3)); + asm __volatile__ ("### transform_t::orthonormal_inverse ### \n" + "lqc2 $vf1, 0(%[T]) \n" + "lqc2 $vf2, 16(%[T]) \n" + "lqc2 $vf3, 32(%[T]) \n" + "lqc2 $vf4, 48(%[T]) \n" + "vadd.x $vf5, $vf0, $vf1 \n" + "vadd.y $vf6, $vf0, $vf2 \n" + "vadd.z $vf7, $vf0, $vf3 \n" + "vaddx.y $vf5, $vf0, $vf2 \n" + "vaddy.x $vf6, $vf0, $vf1 \n" + "vaddz.x $vf7, $vf0, $vf1 \n" + "vaddx.z $vf5, $vf0, $vf3 \n" + "vaddy.z $vf6, $vf0, $vf3 \n" + "vaddz.y $vf7, $vf0, $vf2 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmsubax $ACC, $vf5, $vf4 \n" + "vmsubay $ACC, $vf6, $vf4 \n" + "vmsubz $vf8, $vf7, $vf4 \n" + "sqc2 $vf5, 0(%[R]) \n" + "sqc2 $vf6, 16(%[R]) \n" + "sqc2 $vf7, 32(%[R]) \n" + "sqc2 $vf8, 48(%[R]) \n" + : "=r"(vu0_ACC) + : [T] "r"(this), [R] "r"(&result) + : "memory"); return result; } diff --git a/include/ps2s/matrix_common.h b/include/ps2s/matrix_common.h index dcdb487..10432fa 100644 --- a/include/ps2s/matrix_common.h +++ b/include/ps2s/matrix_common.h @@ -34,14 +34,16 @@ class mat_x3_template { void set_zero() { - asm( - " ### mat_x3::set_zero ### \n" - "vsub col0, col0, col0 \n" - "vsub col1, col1, col1 \n" - "vsub col2, col2, col2 \n" - : "=j col0"(col0.vec128), - "=j col1"(col1.vec128), - "=j col2"(col2.vec128)); + asm __volatile__( + "vsub $vf1, $vf0, $vf0 \n" + "vsub $vf2, $vf0, $vf0 \n" + "vsub $vf3, $vf0, $vf0 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128) + : + : "memory"); } void set_col0(column_type new_col) { col0 = new_col; } @@ -50,54 +52,78 @@ class mat_x3_template { void set_row0(vec_3 new_row) { - asm( - " ### mat_x3::set_row0 ### \n" - "vaddx.x col0, vf00, new_row \n" - "vaddy.x col1, vf00, new_row \n" - "vaddz.x col2, vf00, new_row \n" - : "+j col0"(col0.vec128), - "+j col1"(col1.vec128), - "+j col2"(col2.vec128) - : "j new_row"(new_row.vec128)); + vec128_t new_row_val = new_row.vec128; + asm __volatile__( + "qmtc2 %[col0], $vf1 \n" + "qmtc2 %[col1], $vf2 \n" + "qmtc2 %[col2], $vf3 \n" + "qmtc2 %[new_row_val], $vf4 \n" + "vaddx.x $vf1, $vf0, $vf4 \n" + "vaddy.x $vf2, $vf0, $vf4 \n" + "vaddz.x $vf3, $vf0, $vf4 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128) + : [new_row_val] "r"(new_row_val) + : "memory"); } void set_row1(vec_3 new_row) { - asm( - " ### mat_x3::set_row1 ### \n" - "vaddx.y col0, vf00, new_row \n" - "vaddy.y col1, vf00, new_row \n" - "vaddz.y col2, vf00, new_row \n" - : "+j col0"(col0.vec128), - "+j col1"(col1.vec128), - "+j col2"(col2.vec128) - : "j new_row"(new_row.vec128)); + vec128_t new_row_val = new_row.vec128; + asm __volatile__( + "qmtc2 %[col0], $vf1 \n" + "qmtc2 %[col1], $vf2 \n" + "qmtc2 %[col2], $vf3 \n" + "qmtc2 %[new_row_val], $vf4 \n" + "vaddx.y $vf1, $vf0, $vf4 \n" + "vaddy.y $vf2, $vf0, $vf4 \n" + "vaddz.y $vf3, $vf0, $vf4 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128) + : [new_row_val] "r"(new_row_val) + : "memory"); } void set_row2(vec_3 new_row) { - asm( - " ### mat_x3::set_row2 ### \n" - "vaddx.z col0, vf00, new_row \n" - "vaddy.z col1, vf00, new_row \n" - "vaddz.z col2, vf00, new_row \n" - : "+j col0"(col0.vec128), - "+j col1"(col1.vec128), - "+j col2"(col2.vec128) - : "j new_row"(new_row.vec128)); + vec128_t new_row_val = new_row.vec128; + asm __volatile__( + "qmtc2 %[col0], $vf1 \n" + "qmtc2 %[col1], $vf2 \n" + "qmtc2 %[col2], $vf3 \n" + "qmtc2 %[new_row_val], $vf4 \n" + "vaddx.z $vf1, $vf0, $vf4 \n" + "vaddy.z $vf2, $vf0, $vf4 \n" + "vaddz.z $vf3, $vf0, $vf4 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128) + : [new_row_val] "r"(new_row_val) + : "memory"); } void set_row3(vec_3 new_row) { - asm( - " ### mat_x3::set_row3 ### \n" - "vmulx.w col0, vf00, new_row \n" - "vmuly.w col1, vf00, new_row \n" - "vmulz.w col2, vf00, new_row \n" - : "+j col0"(col0.vec128), - "+j col1"(col1.vec128), - "+j col2"(col2.vec128) - : "j new_row"(new_row.vec128)); + vec128_t new_row_val = new_row.vec128; + asm __volatile__( + "qmtc2 %[col0], $vf1 \n" + "qmtc2 %[col1], $vf2 \n" + "qmtc2 %[col2], $vf3 \n" + "qmtc2 %[new_row_val], $vf4 \n" + "vmulx.w $vf1, $vf0, $vf4 \n" + "vmuly.w $vf2, $vf0, $vf4 \n" + "vmulz.w $vf3, $vf0, $vf4 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128) + : [new_row_val] "r"(new_row_val) + : "memory"); } column_type get_col0() const { return col0; } @@ -107,60 +133,84 @@ class mat_x3_template { vec_3 get_row0() const { vec128_t row; - asm( - " ### mat_x3::get_row0 ### \n" - "vaddx.x row_, vf00, col0 \n" - "vaddx.y row_, vf00, col1 \n" - "vaddx.z row_, vf00, col2 \n" - : "=&j row_"(row) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "vsub $vf4, $vf0, $vf0 \n" + "vaddx.x $vf4, $vf0, $vf1 \n" + "vaddx.y $vf4, $vf0, $vf2 \n" + "vaddx.z $vf4, $vf0, $vf3 \n" + "qmfc2 %[row], $vf4 \n" + : [row] "=&r"(row) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val) + : "memory"); return vec_3(row); } vec_3 get_row1() const { vec128_t row; - asm( - " ### mat_x3::get_row1 ### \n" - "vaddy.x row_, vf00, col0 \n" - "vaddy.y row_, vf00, col1 \n" - "vaddy.z row_, vf00, col2 \n" - : "=&j row_"(row) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "vsub $vf4, $vf0, $vf0 \n" + "vaddy.x $vf4, $vf0, $vf1 \n" + "vaddy.y $vf4, $vf0, $vf2 \n" + "vaddy.z $vf4, $vf0, $vf3 \n" + "qmfc2 %[row], $vf4 \n" + : [row] "=&r"(row) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val) + : "memory"); return vec_3(row); } vec_3 get_row2() const { vec128_t row; - asm( - " ### mat_x3::get_row2 ### \n" - "vaddz.x row_, vf00, col0 \n" - "vaddz.y row_, vf00, col1 \n" - "vaddz.z row_, vf00, col2 \n" - : "=&j row_"(row) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "vsub $vf4, $vf0, $vf0 \n" + "vaddz.x $vf4, $vf0, $vf1 \n" + "vaddz.y $vf4, $vf0, $vf2 \n" + "vaddz.z $vf4, $vf0, $vf3 \n" + "qmfc2 %[row], $vf4 \n" + : [row] "=&r"(row) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val) + : "memory"); return vec_3(row); } vec_3 get_row3() const { vec128_t row; - asm( - " ### mat_x3::get_row3 ### \n" - "vaddw.x row_, vf00, col0 \n" - "vaddw.y row_, vf00, col1 \n" - "vaddw.z row_, vf00, col2 \n" - : "=&j row_"(row) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "vsub $vf4, $vf0, $vf0 \n" + "vaddw.x $vf4, $vf0, $vf1 \n" + "vaddw.y $vf4, $vf0, $vf2 \n" + "vaddw.z $vf4, $vf0, $vf3 \n" + "qmfc2 %[row], $vf4 \n" + : [row] "=&r"(row) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val) + : "memory"); return vec_3(row); } @@ -168,14 +218,23 @@ class mat_x3_template { operator*(vec_3 vec) const { vec128_t result; - asm( - " ### mat_x3 * vec_3 ### \n" - "vmulax ACC, col0, vec \n" - "vmadday ACC, col1, vec \n" - "vmaddz result, col2, vec \n" - : "=&j result"(result), "=r"(vu0_ACC) - : "j vec"(vec.vec128), - "j col0"(col0.vec128), "j col1"(col1.vec128), "j col2"(col2.vec128)); + vec128_t vec_val = vec.vec128; + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[vec_val], $vf4 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulax $ACC, $vf1, $vf4 \n" + "vmadday $ACC, $vf2, $vf4 \n" + "vmaddz $vf5, $vf3, $vf4 \n" + "qmfc2 %[result], $vf5 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [vec_val] "r"(vec_val), [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val) + : "memory"); return column_type(result); } @@ -184,17 +243,28 @@ class mat_x3_template { mult_trans_col0(const mat_x3_template& mat) const { vec128_t result; - asm("### mat_x3 mult_trans mat_x3, column 0 of result ### \n" - "vmulax ACC, col0, mat0 \n" - "vmaddax ACC, col1, mat1 \n" - "vmaddx res0, col2, mat2 \n" - : "=&j res0"(result), "=r"(vu0_ACC) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j mat0"(mat.col0.vec128), - "j mat1"(mat.col1.vec128), - "j mat2"(mat.col2.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t mat0_val = mat.col0.vec128; + vec128_t mat1_val = mat.col1.vec128; + vec128_t mat2_val = mat.col2.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[mat0_val], $vf4 \n" + "qmtc2 %[mat1_val], $vf5 \n" + "qmtc2 %[mat2_val], $vf6 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulax $ACC, $vf1, $vf4 \n" + "vmaddax $ACC, $vf2, $vf5 \n" + "vmaddx $vf7, $vf3, $vf6 \n" + "qmfc2 %[result], $vf7 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), + [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val) + : "memory"); return column_type(result); } @@ -203,17 +273,28 @@ class mat_x3_template { mult_trans_col1(const mat_x3_template& mat) const { vec128_t result; - asm("### mat_x3 mult_trans mat_x3, column 1 of result ### \n" - "vmulay ACC, col0, mat0 \n" - "vmadday ACC, col1, mat1 \n" - "vmaddy res1, col2, mat2 \n" - : "=&j res1"(result), "=r"(vu0_ACC) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j mat0"(mat.col0.vec128), - "j mat1"(mat.col1.vec128), - "j mat2"(mat.col2.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t mat0_val = mat.col0.vec128; + vec128_t mat1_val = mat.col1.vec128; + vec128_t mat2_val = mat.col2.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[mat0_val], $vf4 \n" + "qmtc2 %[mat1_val], $vf5 \n" + "qmtc2 %[mat2_val], $vf6 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulay $ACC, $vf1, $vf4 \n" + "vmadday $ACC, $vf2, $vf5 \n" + "vmaddy $vf7, $vf3, $vf6 \n" + "qmfc2 %[result], $vf7 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), + [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val) + : "memory"); return column_type(result); } @@ -222,17 +303,28 @@ class mat_x3_template { mult_trans_col2(const mat_x3_template& mat) const { vec128_t result; - asm("### mat_x3 mult_trans mat_x3, column 2 of result ### \n" - "vmulaz ACC, col0, mat0 \n" - "vmaddaz ACC, col1, mat1 \n" - "vmaddz res2, col2, mat2 \n" - : "=&j res2"(result), "=r"(vu0_ACC) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j mat0"(mat.col0.vec128), - "j mat1"(mat.col1.vec128), - "j mat2"(mat.col2.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t mat0_val = mat.col0.vec128; + vec128_t mat1_val = mat.col1.vec128; + vec128_t mat2_val = mat.col2.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[mat0_val], $vf4 \n" + "qmtc2 %[mat1_val], $vf5 \n" + "qmtc2 %[mat2_val], $vf6 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulaz $ACC, $vf1, $vf4 \n" + "vmaddaz $ACC, $vf2, $vf5 \n" + "vmaddz $vf7, $vf3, $vf6 \n" + "qmfc2 %[result], $vf7 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), + [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val) + : "memory"); return column_type(result); } @@ -241,17 +333,28 @@ class mat_x3_template { mult_trans_col3(const mat_x3_template& mat) const { vec128_t result; - asm("### mat_x3 mult_trans mat_x3, column 3 of result ### \n" - "vmulaw ACC, col0, mat0 \n" - "vmaddaw ACC, col1, mat1 \n" - "vmaddw res3, col2, mat2 \n" - : "=&j res3"(result), "=r"(vu0_ACC) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j mat0"(mat.col0.vec128), - "j mat1"(mat.col1.vec128), - "j mat2"(mat.col2.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t mat0_val = mat.col0.vec128; + vec128_t mat1_val = mat.col1.vec128; + vec128_t mat2_val = mat.col2.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[mat0_val], $vf4 \n" + "qmtc2 %[mat1_val], $vf5 \n" + "qmtc2 %[mat2_val], $vf6 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulaw $ACC, $vf1, $vf4 \n" + "vmaddaw $ACC, $vf2, $vf5 \n" + "vmaddw $vf7, $vf3, $vf6 \n" + "qmfc2 %[result], $vf7 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), + [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val) + : "memory"); return column_type(result); } }; @@ -284,16 +387,18 @@ class mat_x4_template { void set_zero() { - asm( - " ### mat_x4::set_zero ### \n" - "vsub col0, col0, col0 \n" - "vsub col1, col1, col1 \n" - "vsub col2, col2, col2 \n" - "vsub col3, col3, col3 \n" - : "=j col0"(col0.vec128), - "=j col1"(col1.vec128), - "=j col2"(col2.vec128), - "=j col3"(col3.vec128)); + asm __volatile__( + "vsub $vf1, $vf0, $vf0 \n" + "vsub $vf2, $vf0, $vf0 \n" + "vsub $vf3, $vf0, $vf0 \n" + "vsub $vf4, $vf0, $vf0 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + "qmfc2 %[col3], $vf4 \n" + : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128), [col3] "=r"(col3.vec128) + : + : "memory"); } void set_col0(column_type new_col) { col0 = new_col; } @@ -303,62 +408,90 @@ class mat_x4_template { void set_row0(vec_4 new_row) { - asm( - " ### mat_x4::set_row0 ### \n" - "vaddx.x col0, vf00, new_row \n" - "vaddy.x col1, vf00, new_row \n" - "vaddz.x col2, vf00, new_row \n" - "vaddw.x col3, vf00, new_row \n" - : "+j col0"(col0.vec128), - "+j col1"(col1.vec128), - "+j col2"(col2.vec128), - "+j col3"(col3.vec128) - : "j new_row"(new_row.vec128)); + vec128_t new_row_val = new_row.vec128; + asm __volatile__( + "qmtc2 %[col0], $vf1 \n" + "qmtc2 %[col1], $vf2 \n" + "qmtc2 %[col2], $vf3 \n" + "qmtc2 %[col3], $vf4 \n" + "qmtc2 %[new_row_val], $vf5 \n" + "vaddx.x $vf1, $vf0, $vf5 \n" + "vaddy.x $vf2, $vf0, $vf5 \n" + "vaddz.x $vf3, $vf0, $vf5 \n" + "vaddw.x $vf4, $vf0, $vf5 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + "qmfc2 %[col3], $vf4 \n" + : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128), [col3] "+r"(col3.vec128) + : [new_row_val] "r"(new_row_val) + : "memory"); } void set_row1(vec_4 new_row) { - asm( - " ### mat_x4::set_row1 ### \n" - "vaddx.y col0, vf00, new_row \n" - "vaddy.y col1, vf00, new_row \n" - "vaddz.y col2, vf00, new_row \n" - "vaddw.y col3, vf00, new_row \n" - : "+j col0"(col0.vec128), - "+j col1"(col1.vec128), - "+j col2"(col2.vec128), - "+j col3"(col3.vec128) - : "j new_row"(new_row.vec128)); + vec128_t new_row_val = new_row.vec128; + asm __volatile__( + "qmtc2 %[col0], $vf1 \n" + "qmtc2 %[col1], $vf2 \n" + "qmtc2 %[col2], $vf3 \n" + "qmtc2 %[col3], $vf4 \n" + "qmtc2 %[new_row_val], $vf5 \n" + "vaddx.y $vf1, $vf0, $vf5 \n" + "vaddy.y $vf2, $vf0, $vf5 \n" + "vaddz.y $vf3, $vf0, $vf5 \n" + "vaddw.y $vf4, $vf0, $vf5 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + "qmfc2 %[col3], $vf4 \n" + : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128), [col3] "+r"(col3.vec128) + : [new_row_val] "r"(new_row_val) + : "memory"); } void set_row2(vec_4 new_row) { - asm( - " ### mat_x4::set_row2 ### \n" - "vaddx.z col0, vf00, new_row \n" - "vaddy.z col1, vf00, new_row \n" - "vaddz.z col2, vf00, new_row \n" - "vaddw.z col3, vf00, new_row \n" - : "+j col0"(col0.vec128), - "+j col1"(col1.vec128), - "+j col2"(col2.vec128), - "+j col3"(col3.vec128) - : "j new_row"(new_row.vec128)); + vec128_t new_row_val = new_row.vec128; + asm __volatile__( + "qmtc2 %[col0], $vf1 \n" + "qmtc2 %[col1], $vf2 \n" + "qmtc2 %[col2], $vf3 \n" + "qmtc2 %[col3], $vf4 \n" + "qmtc2 %[new_row_val], $vf5 \n" + "vaddx.z $vf1, $vf0, $vf5 \n" + "vaddy.z $vf2, $vf0, $vf5 \n" + "vaddz.z $vf3, $vf0, $vf5 \n" + "vaddw.z $vf4, $vf0, $vf5 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + "qmfc2 %[col3], $vf4 \n" + : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128), [col3] "+r"(col3.vec128) + : [new_row_val] "r"(new_row_val) + : "memory"); } void set_row3(vec_4 new_row) { - asm( - " ### mat_x4::set_row3 ### \n" - "vmulx.w col0, vf00, new_row \n" - "vmuly.w col1, vf00, new_row \n" - "vmulz.w col2, vf00, new_row \n" - "vmulw.w col3, vf00, new_row \n" - : "+j col0"(col0.vec128), - "+j col1"(col1.vec128), - "+j col2"(col2.vec128), - "+j col3"(col3.vec128) - : "j new_row"(new_row.vec128)); + vec128_t new_row_val = new_row.vec128; + asm __volatile__( + "qmtc2 %[col0], $vf1 \n" + "qmtc2 %[col1], $vf2 \n" + "qmtc2 %[col2], $vf3 \n" + "qmtc2 %[col3], $vf4 \n" + "qmtc2 %[new_row_val], $vf5 \n" + "vmulx.w $vf1, $vf0, $vf5 \n" + "vmuly.w $vf2, $vf0, $vf5 \n" + "vmulz.w $vf3, $vf0, $vf5 \n" + "vmulw.w $vf4, $vf0, $vf5 \n" + "qmfc2 %[col0], $vf1 \n" + "qmfc2 %[col1], $vf2 \n" + "qmfc2 %[col2], $vf3 \n" + "qmfc2 %[col3], $vf4 \n" + : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128), [col3] "+r"(col3.vec128) + : [new_row_val] "r"(new_row_val) + : "memory"); } column_type get_col0() const { return col0; } @@ -369,68 +502,96 @@ class mat_x4_template { vec_4 get_row0() const { vec128_t row; - asm( - " ### mat_x4::get_row0 ### \n" - "vaddx.x row_, vf00, col0 \n" - "vaddx.y row_, vf00, col1 \n" - "vaddx.z row_, vf00, col2 \n" - "vmulx.w row_, vf00, col3 \n" - : "=&j row_"(row) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j col3"(col3.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "vsub $vf5, $vf0, $vf0 \n" + "vaddx.x $vf5, $vf0, $vf1 \n" + "vaddx.y $vf5, $vf0, $vf2 \n" + "vaddx.z $vf5, $vf0, $vf3 \n" + "vmulx.w $vf5, $vf0, $vf4 \n" + "qmfc2 %[row], $vf5 \n" + : [row] "=&r"(row) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val) + : "memory"); return vec_4(row); } vec_4 get_row1() const { vec128_t row; - asm( - " ### mat_x4::get_row1 ### \n" - "vaddy.x row_, vf00, col0 \n" - "vaddy.y row_, vf00, col1 \n" - "vaddy.z row_, vf00, col2 \n" - "vmuly.w row_, vf00, col3 \n" - : "=&j row_"(row) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j col3"(col3.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "vsub $vf5, $vf0, $vf0 \n" + "vaddy.x $vf5, $vf0, $vf1 \n" + "vaddy.y $vf5, $vf0, $vf2 \n" + "vaddy.z $vf5, $vf0, $vf3 \n" + "vmuly.w $vf5, $vf0, $vf4 \n" + "qmfc2 %[row], $vf5 \n" + : [row] "=&r"(row) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val) + : "memory"); return vec_4(row); } vec_4 get_row2() const { vec128_t row; - asm( - " ### mat_x4::get_row2 ### \n" - "vaddz.x row_, vf00, col0 \n" - "vaddz.y row_, vf00, col1 \n" - "vaddz.z row_, vf00, col2 \n" - "vmulz.w row_, vf00, col3 \n" - : "=&j row_"(row) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j col3"(col3.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "vsub $vf5, $vf0, $vf0 \n" + "vaddz.x $vf5, $vf0, $vf1 \n" + "vaddz.y $vf5, $vf0, $vf2 \n" + "vaddz.z $vf5, $vf0, $vf3 \n" + "vmulz.w $vf5, $vf0, $vf4 \n" + "qmfc2 %[row], $vf5 \n" + : [row] "=&r"(row) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val) + : "memory"); return vec_4(row); } vec_4 get_row3() const { vec128_t row; - asm( - " ### mat_x4::get_row3 ### \n" - "vaddw.x row_, vf00, col0 \n" - "vaddw.y row_, vf00, col1 \n" - "vaddw.z row_, vf00, col2 \n" - "vmulw.w row_, vf00, col3 \n" - : "=&j row_"(row) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j col3"(col3.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "vsub $vf5, $vf0, $vf0 \n" + "vaddw.x $vf5, $vf0, $vf1 \n" + "vaddw.y $vf5, $vf0, $vf2 \n" + "vaddw.z $vf5, $vf0, $vf3 \n" + "vmulw.w $vf5, $vf0, $vf4 \n" + "qmfc2 %[row], $vf5 \n" + : [row] "=&r"(row) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val) + : "memory"); return vec_4(row); } @@ -438,16 +599,26 @@ class mat_x4_template { operator*(vec_4 vec) const { vec128_t result; - asm( - " ### mat_x4 * vec_4 ### \n" - "vmulax ACC, col0, vec \n" - "vmadday ACC, col1, vec \n" - "vmaddaz ACC, col2, vec \n" - "vmaddw result, col3, vec \n" - : "=&j result"(result), "=r"(vu0_ACC) - : "j vec"(vec.vec128), - "j col0"(col0.vec128), "j col1"(col1.vec128), - "j col2"(col2.vec128), "j col3"(col3.vec128)); + vec128_t vec_val = vec.vec128; + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[vec_val], $vf5 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulax $ACC, $vf1, $vf5 \n" + "vmadday $ACC, $vf2, $vf5 \n" + "vmaddaz $ACC, $vf3, $vf5 \n" + "vmaddw $vf6, $vf4, $vf5 \n" + "qmfc2 %[result], $vf6 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [vec_val] "r"(vec_val), [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val) + : "memory"); return column_type(result); } @@ -455,14 +626,23 @@ class mat_x4_template { operator*(vector_t vec) const { vec128_t result; - asm( - " ### mat_x4 * vector_t ### \n" - "vmulax ACC, col0, vec \n" - "vmadday ACC, col1, vec \n" - "vmaddz result, col2, vec \n" - : "=&j result"(result), "=r"(vu0_ACC) - : "j vec"(vec.vec128), - "j col0"(col0.vec128), "j col1"(col1.vec128), "j col2"(col2.vec128)); + vec128_t vec_val = vec.vec128; + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[vec_val], $vf4 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulax $ACC, $vf1, $vf4 \n" + "vmadday $ACC, $vf2, $vf4 \n" + "vmaddz $vf5, $vf3, $vf4 \n" + "qmfc2 %[result], $vf5 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [vec_val] "r"(vec_val), [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val) + : "memory"); return column_type(result); } @@ -470,16 +650,26 @@ class mat_x4_template { operator*(point_t pt) const { vec128_t result; - asm( - " ### mat_x4 * point_t ### \n" - "vmulax ACC, col0, pt \n" - "vmadday ACC, col1, pt \n" - "vmaddaz ACC, col2, pt \n" - "vmaddw result, col3, vf00 \n" - : "=&j result"(result), "=r"(vu0_ACC) - : "j pt"(pt.vec128), - "j col0"(col0.vec128), "j col1"(col1.vec128), - "j col2"(col2.vec128), "j col3"(col3.vec128)); + vec128_t pt_val = pt.vec128; + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[pt_val], $vf5 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulax $ACC, $vf1, $vf5 \n" + "vmadday $ACC, $vf2, $vf5 \n" + "vmaddaz $ACC, $vf3, $vf5 \n" + "vmaddw $vf6, $vf4, $vf0 \n" + "qmfc2 %[result], $vf6 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [pt_val] "r"(pt_val), [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val) + : "memory"); return column_type(result); } @@ -488,20 +678,33 @@ class mat_x4_template { mult_trans_col0(const mat_x4_template& mat) const { vec128_t result; - asm("### mat_x4 mult_trans mat_x4, column 0 of result ### \n" - "vmulax ACC, col0, mat0 \n" - "vmaddax ACC, col1, mat1 \n" - "vmaddax ACC, col2, mat2 \n" - "vmaddx res0, col3, mat3 \n" - : "=&j res0"(result), "=r"(vu0_ACC) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j col3"(col3.vec128), - "j mat0"(mat.col0.vec128), - "j mat1"(mat.col1.vec128), - "j mat2"(mat.col2.vec128), - "j mat3"(mat.col3.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + vec128_t mat0_val = mat.col0.vec128; + vec128_t mat1_val = mat.col1.vec128; + vec128_t mat2_val = mat.col2.vec128; + vec128_t mat3_val = mat.col3.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[mat0_val], $vf5 \n" + "qmtc2 %[mat1_val], $vf6 \n" + "qmtc2 %[mat2_val], $vf7 \n" + "qmtc2 %[mat3_val], $vf8 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulax $ACC, $vf1, $vf5 \n" + "vmaddax $ACC, $vf2, $vf6 \n" + "vmaddax $ACC, $vf3, $vf7 \n" + "vmaddx $vf9, $vf4, $vf8 \n" + "qmfc2 %[result], $vf9 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), + [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val), [mat3_val] "r"(mat3_val) + : "memory"); return column_type(result); } @@ -510,20 +713,33 @@ class mat_x4_template { mult_trans_col1(const mat_x4_template& mat) const { vec128_t result; - asm("### mat_x4 mult_trans mat_x4, column 1 of result ### \n" - "vmulay ACC, col0, mat0 \n" - "vmadday ACC, col1, mat1 \n" - "vmadday ACC, col2, mat2 \n" - "vmaddy res1, col3, mat3 \n" - : "=&j res1"(result), "=r"(vu0_ACC) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j col3"(col3.vec128), - "j mat0"(mat.col0.vec128), - "j mat1"(mat.col1.vec128), - "j mat2"(mat.col2.vec128), - "j mat3"(mat.col3.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + vec128_t mat0_val = mat.col0.vec128; + vec128_t mat1_val = mat.col1.vec128; + vec128_t mat2_val = mat.col2.vec128; + vec128_t mat3_val = mat.col3.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[mat0_val], $vf5 \n" + "qmtc2 %[mat1_val], $vf6 \n" + "qmtc2 %[mat2_val], $vf7 \n" + "qmtc2 %[mat3_val], $vf8 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulay $ACC, $vf1, $vf5 \n" + "vmadday $ACC, $vf2, $vf6 \n" + "vmadday $ACC, $vf3, $vf7 \n" + "vmaddy $vf9, $vf4, $vf8 \n" + "qmfc2 %[result], $vf9 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), + [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val), [mat3_val] "r"(mat3_val) + : "memory"); return column_type(result); } @@ -532,20 +748,33 @@ class mat_x4_template { mult_trans_col2(const mat_x4_template& mat) const { vec128_t result; - asm("### mat_x4 mult_trans mat_x4, column 2 of result ### \n" - "vmulaz ACC, col0, mat0 \n" - "vmaddaz ACC, col1, mat1 \n" - "vmaddaz ACC, col2, mat2 \n" - "vmaddz res2, col3, mat3 \n" - : "=&j res2"(result), "=r"(vu0_ACC) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j col3"(col3.vec128), - "j mat0"(mat.col0.vec128), - "j mat1"(mat.col1.vec128), - "j mat2"(mat.col2.vec128), - "j mat3"(mat.col3.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + vec128_t mat0_val = mat.col0.vec128; + vec128_t mat1_val = mat.col1.vec128; + vec128_t mat2_val = mat.col2.vec128; + vec128_t mat3_val = mat.col3.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[mat0_val], $vf5 \n" + "qmtc2 %[mat1_val], $vf6 \n" + "qmtc2 %[mat2_val], $vf7 \n" + "qmtc2 %[mat3_val], $vf8 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulaz $ACC, $vf1, $vf5 \n" + "vmaddaz $ACC, $vf2, $vf6 \n" + "vmaddaz $ACC, $vf3, $vf7 \n" + "vmaddz $vf9, $vf4, $vf8 \n" + "qmfc2 %[result], $vf9 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), + [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val), [mat3_val] "r"(mat3_val) + : "memory"); return column_type(result); } @@ -554,20 +783,33 @@ class mat_x4_template { mult_trans_col3(const mat_x4_template& mat) const { vec128_t result; - asm("### mat_x4 mult_trans mat_x4, column 3 of result ### \n" - "vmulaw ACC, col0, mat0 \n" - "vmaddaw ACC, col1, mat1 \n" - "vmaddaw ACC, col2, mat2 \n" - "vmaddw res3, col3, mat3 \n" - : "=&j res3"(result), "=r"(vu0_ACC) - : "j col0"(col0.vec128), - "j col1"(col1.vec128), - "j col2"(col2.vec128), - "j col3"(col3.vec128), - "j mat0"(mat.col0.vec128), - "j mat1"(mat.col1.vec128), - "j mat2"(mat.col2.vec128), - "j mat3"(mat.col3.vec128)); + vec128_t col0_val = col0.vec128; + vec128_t col1_val = col1.vec128; + vec128_t col2_val = col2.vec128; + vec128_t col3_val = col3.vec128; + vec128_t mat0_val = mat.col0.vec128; + vec128_t mat1_val = mat.col1.vec128; + vec128_t mat2_val = mat.col2.vec128; + vec128_t mat3_val = mat.col3.vec128; + asm __volatile__( + "qmtc2 %[col0_val], $vf1 \n" + "qmtc2 %[col1_val], $vf2 \n" + "qmtc2 %[col2_val], $vf3 \n" + "qmtc2 %[col3_val], $vf4 \n" + "qmtc2 %[mat0_val], $vf5 \n" + "qmtc2 %[mat1_val], $vf6 \n" + "qmtc2 %[mat2_val], $vf7 \n" + "qmtc2 %[mat3_val], $vf8 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmulaw $ACC, $vf1, $vf5 \n" + "vmaddaw $ACC, $vf2, $vf6 \n" + "vmaddaw $ACC, $vf3, $vf7 \n" + "vmaddw $vf9, $vf4, $vf8 \n" + "qmfc2 %[result], $vf9 \n" + : [result] "=&r"(result), [acc] "=r"(vu0_ACC) + : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), + [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val), [mat3_val] "r"(mat3_val) + : "memory"); return column_type(result); } }; diff --git a/include/ps2s/vector.h b/include/ps2s/vector.h index f4d5b83..1a46b51 100644 --- a/include/ps2s/vector.h +++ b/include/ps2s/vector.h @@ -211,25 +211,11 @@ class vec_x : public vec_template { inline vec_x(float x) { - asm( - " ### init vec_x with a float ### \n" - " .if %A0 \n" - " move %[_this], %[new_x] \n" - " .endif \n" - " \n" - " .if %A1 \n" - " lw %[_this], %[new_x] \n" - " .endif \n" - " \n" - " .if %A2 \n" - " mfc1 %[_this], %[new_x] \n" - " .endif \n" - " \n" - " .if %A3 \n" - " qmtc2 %[new_x], %[_this] \n" - " .endif \n" - : [_this] "=r,r,r,j"(vec128) - : [new_x] "r,m,f,r"(x)); + vec128_t temp; + asm __volatile__( + "mfc1 %[vec128], %[x] \n" + : [vec128] "=r"(vec128) + : [x] "f"(x)); } explicit inline vec_x(const vec_xyz rhs); @@ -240,25 +226,14 @@ class vec_x : public vec_template { inline operator float() const { float new_float; - asm( - " ### vec_x to float ### \n" - " .if %A0 \n" - " move %[new_float], %[_this] \n" - " .endif \n" - " \n" - " .if %A1 \n" - " sw %[_this], %[new_float] \n" - " .endif \n" - " \n" - " .if %A2 \n" - " mtc1 %[_this], %[new_float] \n" - " .endif \n" - " \n" - " .if %A3 \n" - " qmfc2 %[new_float], %[_this] \n" - " .endif \n" - : [new_float] "=r,m,f,r"(new_float) - : [_this] "r,r,r,j"(*this)); + vec128_t this_val = this->vec128; + uint32_t x_bits; + asm __volatile__( + "move %[x_bits], %[this_val] \n" + : [x_bits] "=r"(x_bits) + : [this_val] "r"(this_val) + : "memory"); + new_float = *reinterpret_cast(&x_bits); return new_float; } @@ -367,13 +342,16 @@ class vec_y : public vec_template { inline vec_y(float y) { - asm( - " ### init vec_y with a float ### \n" - "ctc2 %[f_y], $vi21 \n" - "vnop \n" - "vaddi.y %[_this], vf00, I \n" - : [_this] "=j"(vec128) - : [f_y] "r"(y)); + vec128_t result; + asm __volatile__( + "ctc2 %[f_y], $vi21 \n" + "vnop \n" + "vaddi.y $vf1, $vf0, $I \n" + "qmfc2 %[result], $vf1 \n" + : [result] "=r"(result) + : [f_y] "r"(y) + : "memory"); + vec128 = result; } explicit inline vec_y(const vec_xyz rhs); @@ -490,13 +468,16 @@ class vec_z : public vec_template { inline vec_z(float z) { - asm( - " ### init vec_z with a float ### \n" - "ctc2 %[f_z], $vi21 \n" - "vnop \n" - "vaddi.z %[_this], vf00, I \n" - : [_this] "=j"(vec128) - : [f_z] "r"(z)); + vec128_t result; + asm __volatile__( + "ctc2 %[f_z], $vi21 \n" + "vnop \n" + "vaddi.z $vf1, $vf0, $I \n" + "qmfc2 %[result], $vf1 \n" + : [result] "=r"(result) + : [f_z] "r"(z) + : "memory"); + vec128 = result; } explicit inline vec_z(const vec_xyz rhs); @@ -613,13 +594,16 @@ class vec_w : public vec_template { inline vec_w(float w) { - asm( - " ### init vec_w with a float ### \n" - "ctc2 %[f_w], $vi21 \n" - "vnop \n" - "vmuli.w %[_this], vf00, I \n" - : [_this] "=j"(vec128) - : [f_w] "r"(w)); + vec128_t result; + asm __volatile__( + "ctc2 %[f_w], $vi21 \n" + "vnop \n" + "vmuli.w $vf1, $vf0, $I \n" + "qmfc2 %[result], $vf1 \n" + : [result] "=r"(result) + : [f_w] "r"(w) + : "memory"); + vec128 = result; } explicit inline vec_w(const vec_xyzw rhs); @@ -1082,15 +1066,19 @@ class vec_xy : public vec_template { inline vec_xy one_over() const { vec128_t result; - asm(" ### reciprocal of vec_xy ### \n" - "vdiv Q, vf00w, %[_this]x \n" - "vwaitq \n" - "vaddq.x %[result], vf00, Q \n" - "vdiv Q, vf00w, %[_this]y \n" - "vwaitq \n" - "vaddq.y %[result], vf00, Q \n" - : [result] "=&j"(result) - : [_this] "j"(*this)); + vec128_t this_val = this->vec128; + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + "vdiv $Q, $vf0w, $vf1x \n" + "vwaitq \n" + "vaddq.x $vf2, $vf0, $Q \n" + "vdiv $Q, $vf0w, $vf1y \n" + "vwaitq \n" + "vaddq.y $vf2, $vf0, $Q \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val) + : "memory"); return vec_xy(result); } @@ -1330,18 +1318,22 @@ class vec_xyz : public vec_template { inline vec_xyz one_over() const { vec128_t result; - asm(" ### reciprocal of vec_xyz ### \n" - "vdiv Q, vf00w, %[_this]x \n" - "vwaitq \n" - "vaddq.x %[result], vf00, Q \n" - "vdiv Q, vf00w, %[_this]y \n" - "vwaitq \n" - "vaddq.y %[result], vf00, Q \n" - "vdiv Q, vf00w, %[_this]z \n" - "vwaitq \n" - "vaddq.z %[result], vf00, Q \n" - : [result] "=&j"(result) - : [_this] "j"(*this)); + vec128_t this_val = this->vec128; + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + "vdiv $Q, $vf0w, $vf1x \n" + "vwaitq \n" + "vaddq.x $vf2, $vf0, $Q \n" + "vdiv $Q, $vf0w, $vf1y \n" + "vwaitq \n" + "vaddq.y $vf2, $vf0, $Q \n" + "vdiv $Q, $vf0w, $vf1z \n" + "vwaitq \n" + "vaddq.z $vf2, $vf0, $Q \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val) + : "memory"); return vec_xyz(result); } @@ -1376,14 +1368,20 @@ class vec_xyz : public vec_template { dot(const vec_xyz rhs) const { vec128_t result, one; - asm( - " ### vec_xyz dot vec_xyz ### \n" - "vmul %[result], %[lhs], %[rhs] \n" - "vaddw.x %[one], vf00, vf00 \n" - "vadday.x ACC, %[result], %[result] \n" - "vmaddz.x %[result], %[one], %[result] \n" - : [result] "=j"(result), [one] "=j"(one) - : [lhs] "j"(*this), [rhs] "j"(rhs)); + vec128_t lhs_val = this->vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vmul $vf3, $vf1, $vf2 \n" + "vaddw.x $vf4, $vf0, $vf0 \n" + "vadday.x $ACC, $vf3, $vf3 \n" + "vmaddz.x $vf5, $vf4, $vf3 \n" + "qmfc2 %[result], $vf5 \n" + "qmfc2 %[one], $vf4 \n" + : [result] "=r"(result), [one] "=r"(one), [acc] "=r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val) + : "memory"); return vec_x(result); } @@ -1394,24 +1392,28 @@ class vec_xyz : public vec_template { { vec128_t result, dot, one; int cond; - asm( - "### vec_xyz normalized ###\n" - "vmul %[dot], %[_this], %[_this] \n" - "vaddw.x %[one], vf00, vf00 \n" - "vadday.x ACC, %[dot], %[dot] \n" - "vmaddz.x %[dot], %[one], %[dot] \n" - "vrsqrt Q, vf00w, %[dot]x\n" - "cfc2 %[cond], $vi17\n" - "vsub %[result], %[result], %[result]\n" - "andi %[cond], %[cond], 8\n" - "bgtz %[cond], 0f \n" - "nop\n" - "vwaitq\n" - "vmulq %[result], %[_this], Q\n" - "0:\n" - : [result] "=&j"(result), [one] "=&j"(one), [dot] "=&j"(dot), [cond] "=r"(cond) - : [_this] "j"(*this)); - + vec128_t this_val = this->vec128; + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + "vmul $vf2, $vf1, $vf1 \n" + "vaddw.x $vf3, $vf0, $vf0 \n" + "vadday.x $ACC, $vf2, $vf2 \n" + "vmaddz.x $vf2, $vf3, $vf2 \n" + "vrsqrt $Q, $vf0w, $vf2x \n" + "cfc2 %[cond], $vi17 \n" + "vsub $vf4, $vf0, $vf0 \n" + "andi %[cond], %[cond], 8 \n" + "bgtz %[cond], 0f \n" + "nop \n" + "vwaitq \n" + "vmulq $vf4, $vf1, $Q \n" + "0: \n" + "qmfc2 %[result], $vf4 \n" + "qmfc2 %[dot], $vf2 \n" + "qmfc2 %[one], $vf3 \n" + : [result] "=&r"(result), [one] "=&r"(one), [dot] "=&r"(dot), [cond] "=r"(cond) + : [this_val] "r"(this_val) + : "memory"); return vec_xyz(result); } @@ -1419,12 +1421,17 @@ class vec_xyz : public vec_template { cross(const vec_xyz rhs) const { vec128_t result; - asm( - " ### vec_xyz cross vec_xyz ### \n" - "vopmula.xyz ACC, %[lhs], %[rhs] \n" - "vopmsub.xyz %[result], %[rhs], %[lhs] \n" - : [result] "=j"(result) - : [lhs] "j"(*this), [rhs] "j"(rhs)); + vec128_t lhs_val = this->vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vopmula $ACC, $vf1, $vf2 \n" + "vopmsub $vf3, $vf2, $vf1 \n" + "qmfc2 %[result], $vf3 \n" + : [result] "=r"(result), [acc] "=r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val) + : "memory"); return vec_xyz(result); } @@ -1725,21 +1732,25 @@ class vec_xyzw : public vec_template { inline vec_xyzw one_over() const { vec128_t result; - asm(" ### reciprocal of vec_xyzw ### \n" - "vdiv Q, vf00w, %[_this]x \n" - "vwaitq \n" - "vaddq.x %[result], vf00, Q \n" - "vdiv Q, vf00w, %[_this]y \n" - "vwaitq \n" - "vaddq.y %[result], vf00, Q \n" - "vdiv Q, vf00w, %[_this]z \n" - "vwaitq \n" - "vaddq.z %[result], vf00, Q \n" - "vdiv Q, vf00w, %[_this]w \n" - "vwaitq \n" - "vmulq.w %[result], vf00, Q \n" - : [result] "=&j"(result) - : [_this] "j"(*this)); + vec128_t this_val = this->vec128; + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + "vdiv $Q, $vf0w, $vf1x \n" + "vwaitq \n" + "vaddq.x $vf2, $vf0, $Q \n" + "vdiv $Q, $vf0w, $vf1y \n" + "vwaitq \n" + "vaddq.y $vf2, $vf0, $Q \n" + "vdiv $Q, $vf0w, $vf1z \n" + "vwaitq \n" + "vaddq.z $vf2, $vf0, $Q \n" + "vdiv $Q, $vf0w, $vf1w \n" + "vwaitq \n" + "vmulq.w $vf2, $vf0, $Q \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val) + : "memory"); return vec_xyzw(result); } @@ -1807,10 +1818,10 @@ class vec_xyzw : public vec_template { asm( "### vec_xyzw normalized ###\n" "vmul %[dot], %[_this], %[_this] \n" - "vaddw.x %[one, vf00, vf00 \n" + "vaddw.x %[one], vf00, vf00 \n" "vadday.x ACC, %[dot], %[dot] \n" "vmaddaz.x ACC, %[one], %[dot] \n" - "vmaddw.x %[dot, %[one, %[dot] \n" + "vmaddw.x %[dot], %[one], %[dot] \n" "vrsqrt Q, vf00w, %[dot]x\n" "cfc2 %[cond], $vi17\n" "vsub %[result], %[result], %[result]\n" @@ -2330,13 +2341,13 @@ class vector_t : public vec_template { vec128_t temp0; asm volatile( "mtsab $0, 4 # get ready to shift right 4 bytes \n" - "mtc1 %[_vec, %[_x] # x = value.x \n" + "mtc1 %[_vec], %[_x] # x = value.x \n" "qfsrv %[_temp0], $0, %[_vec] # temp0 = value >> 8 \n" "mtc1 %[_temp0], %[_y] # y = value.y \n" "qfsrv %[_temp0], $0, %[_temp0] # temp0 >>= 8 \n" "mtc1 %[_temp0], %[_z] # z = value.z \n" : [_x] "=f"(x), [_y] "=f"(y), [_z] "=f"(z), [_temp0] "=r"(temp0) - : [_vec] "r"(vec128)); + : [_vec] "r"(this->vec128)); printf("(%f %f %f (0.0))\n", x, y, z); } diff --git a/include/ps2s/vector_common.h b/include/ps2s/vector_common.h index febc4a8..2a1994a 100644 --- a/include/ps2s/vector_common.h +++ b/include/ps2s/vector_common.h @@ -7,6 +7,9 @@ #ifndef vector_common_h #define vector_common_h +#include +#include + /******************************************** * This file contains the class 'vec_template' which provides all vector * methods that can be generalized, as well as the '*_base' classes that @@ -65,8 +68,8 @@ */ register int vu0_ACC asm("$0"); -// let's use vf0 for this one to avoid a compiler warning -register int vu0_Q asm("vf0") __attribute__((mode(TI))); +// Q register is accessed via cfc2/ctc2 instructions, not as a global register +// We'll handle it differently in inline asm /******************************************** * constants @@ -175,11 +178,11 @@ class vec_template : public base_vec { // constructors inline vec_template() {} - inline vec_template(const vec128_t initVal) { vec128 = initVal; } + inline vec_template(const vec128_t initVal) { this->vec128 = initVal; } // accessors - inline vec128_t get128() const { return vec128; } + inline vec128_t get128() const { return this->vec128; } // mutators @@ -188,14 +191,17 @@ class vec_template : public base_vec { inline void set(const rhs_type rhs) { - asm(" ### set vec_l_fields = vec_r_fields ### \n" - "vmove.mask this, rhs \n" - : "=j this"(*this) - : "j rhs"(rhs), - "0"(*this), - "vu_mask l_fields"(this->valid_fields), - "vu_mask r_fields"(rhs.valid_fields), - "vu_mask mask"(this->valid_fields & rhs.valid_fields)); + vec128_t rhs_val = rhs.vec128; + unsigned int mask = this->valid_fields & rhs.valid_fields; + asm __volatile__( + "qmtc2 %[rhs_val], $vf1 \n" + "qmtc2 %[this_val], $vf2 \n" + "vsub $vf2, $vf0, $vf0 \n" + "vmove $vf2, $vf1 \n" + "qmfc2 %[this_val], $vf2 \n" + : [this_val] "+r"(this->vec128) + : [rhs_val] "r"(rhs_val), [mask] "r"(mask) + : "memory"); } // make sure that for vectors of the same type it's a simple assignment inline void @@ -207,22 +213,23 @@ class vec_template : public base_vec { inline void set(float rhs) { - asm("### set all vec_fields fields to float ### \n" - "pextlw rhs, rhs, rhs \n" - "pextlw this, rhs, rhs \n" - : "=r this"(*this) - : "r rhs"(rhs)); + asm __volatile__( + "pextlw %[this_val], %[rhs], %[rhs] \n" + "pextlw %[this_val], %[this_val], %[this_val] \n" + : [this_val] "+r"(this->vec128) + : [rhs] "r"(*reinterpret_cast(&rhs))); } inline void set_zero() { - - asm(" ### vec_l_fields set_zero ### \n" - "vsub.mask this, vf00, vf00 \n" - : "=j this"(*this) - : "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields)); + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + "vsub $vf1, $vf0, $vf0 \n" + "qmfc2 %[this_val], $vf1 \n" + : [this_val] "+r"(this->vec128) + : + : "memory"); } static const unsigned int fields = (base_vec::broadcast_field << 4) | base_vec::valid_fields; @@ -241,30 +248,33 @@ class vec_template : public base_vec { { const lhs_type& lhs = *this; vec128_t result; - asm( - " ### vec_l_fields + vec_r_fields ### \n" - "vadd_bc._mask result, lhs, rhs \n" - : "=j result"(result) - : "j lhs"(lhs), "j rhs"(rhs), - "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields), - "vu_mask l_fields"(lhs.valid_fields), - "vu_mask r_fields"(rhs.valid_fields)); + vec128_t lhs_val = lhs.vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vadd $vf3, $vf1, $vf2 \n" + "qmfc2 %[result], $vf3 \n" + : [result] "=r"(result) + : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val) + : "memory"); return lhs_type(result); } inline lhs_type operator+(float rhs) const { vec128_t result; - asm( - " ### vec_l_fields + float ### \n" - "ctc2 rhs, $vi21 \n" - "vnop \n" - "vaddi.mask result, lhs, I \n" - : "=j result"(result) - : "r rhs"(rhs), - "j lhs"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields)); + vec128_t lhs_val = this->vec128; + uint32_t rhs_bits = *reinterpret_cast(&rhs); + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "ctc2 %[rhs_bits], $vi21 \n" + "vnop \n" + "vaddi $vf2, $vf1, $I \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result) + : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits) + : "memory"); return lhs_type(result); } template @@ -280,14 +290,16 @@ class vec_template : public base_vec { { const lhs_type& lhs = *this; vec128_t result; - asm( - " ### vec_l_fields - vec_r_fields ### \n" - "vsub_bc._mask %0, %1, %2 \n" - : "=j"(result) - : "j"(lhs), "j"(rhs), - "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields), - "vu_mask l_fields"(lhs.valid_fields), - "vu_mask r_fields"(rhs.valid_fields)); + vec128_t lhs_val = lhs.vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vsub $vf3, $vf1, $vf2 \n" + "qmfc2 %[result], $vf3 \n" + : [result] "=r"(result) + : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val) + : "memory"); return lhs_type(result); } template @@ -303,32 +315,33 @@ class vec_template : public base_vec { { const lhs_type& lhs = *this; vec128_t result; - asm( - " ### vec_l_fields * vec_r_fields ### \n" - "vmul_bc._mask %0, %1, %2 \n" - : "=j"(result) - : "j"(lhs), "j"(rhs), - "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields), - "vu_mask l_fields"(lhs.valid_fields), - "vu_mask r_fields"(rhs.valid_fields)); - + vec128_t lhs_val = lhs.vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vmul $vf3, $vf1, $vf2 \n" + "qmfc2 %[result], $vf3 \n" + : [result] "=r"(result) + : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val) + : "memory"); return lhs_type(result); } inline lhs_type operator*(float rhs) const { vec128_t result; - asm( - " ### vec_l_fields * float ### \n" - "ctc2 rhs, $vi21 \n" - "vnop \n" - "vmuli.mask result, this, I \n" - : "=j result"(result) - : "r rhs"(rhs), - "j this"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields) - : "vf31"); + vec128_t this_val = this->vec128; + uint32_t rhs_bits = *reinterpret_cast(&rhs); + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + "ctc2 %[rhs_bits], $vi21 \n" + "vnop \n" + "vmuli $vf2, $vf1, $I \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val), [rhs_bits] "r"(rhs_bits) + : "memory"); return lhs_type(result); } template @@ -344,15 +357,18 @@ class vec_template : public base_vec { { const lhs_type& lhs = *this; vec128_t result; - asm( - " ### vec_l_fields / vec__r_bc ### \n" - "vdiv Q, vf00w, rhs_r_bc \n" - "vwaitq \n" - "vmulq.l_fields result, lhs, Q \n" - : "=j result"(result) - : "j lhs"(lhs), "j rhs"(rhs), - "vu_mask l_fields"(lhs.valid_fields), - "vu_bc _r_bc"(rhs.broadcast_field)); + vec128_t lhs_val = lhs.vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vdiv $Q, $vf0w, $vf2x \n" + "vwaitq \n" + "vmulq $vf3, $vf1, $Q \n" + "qmfc2 %[result], $vf3 \n" + : [result] "=r"(result) + : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val) + : "memory"); return lhs_type(result); } inline lhs_type @@ -376,15 +392,15 @@ class vec_template : public base_vec { operator-() const { vec128_t result; - asm( - " ### negate vec_l_fields ### \n" - "vsuba.mask ACC, vf00, vf00 \n" - "vmsubw.mask result, this, vf00w \n" - - : "=j result"(result), "=r"(vu0_ACC) - : "j this"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields)); + vec128_t this_val = this->vec128; + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + "vsuba $ACC, $vf0, $vf0 \n" + "vmsubw $vf2, $vf1, $vf0w \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result), "=r"(vu0_ACC) + : [this_val] "r"(this_val) + : "memory"); return lhs_type(result); } @@ -394,21 +410,21 @@ class vec_template : public base_vec { no_w_negate() const { vec128_t result; - asm( - " ### negate vec_l_fields (no w) ### \n" - "vsub.mask result, vf00, this \n" - - : "=j result"(result) - : "j this"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields)); + vec128_t this_val = this->vec128; + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + "vsub $vf2, $vf0, $vf1 \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val) + : "memory"); return lhs_type(result); } // add and multiply-add functions. // You can write the accumulator in one method and add or // madd to it in another - data dependency is respected - // due to the vu0_ACC global (see declaration above)· + // due to the vu0_ACC global (see declaration above)� // However, you are responsible for matching the type // written to and read from the accumulator - use with // care. @@ -418,13 +434,31 @@ class vec_template : public base_vec { inline void to_a() const { - asm( - " ### ACC = vec_fields ### \n" - "vmulaw.mask ACC, this, vf00 \n" - : "=r"(vu0_ACC) - : "j this"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask fields"(this->valid_fields)); + vec128_t this_val = this->vec128; + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + // Set ACC to this vector: ACC = vf1 + // Use vmulaw: ACC = ACC + vf1 * vf0w = ACC + vf1 * 1.0 + // To set ACC = vf1 (not add), we need ACC to be 0 first + // Use vmula to set ACC = vf1 * (1,1,1,1) + // Create (1,1,1,1) by using vf0w which is 1.0 + // Actually, vmula sets ACC = src1 * src2 (not ACC + src1 * src2) + // So vmula $ACC, $vf1, $vf0 does: ACC = vf1 * vf0 + // But vf0 is (0,0,0,1), so we'd get (0,0,0,vf1.w) + // Need to use vmove to broadcast vf0w or use a different approach + // Let's use vmulaw with zero ACC: if ACC is 0, then ACC = 0 + vf1 * 1.0 = vf1 + // But ACC might not be 0. Let's use vmula with a vector of ones + // Actually, simplest: use vmove to copy vf1 to a temp, then use vmula + "vmove $vf2, $vf1 \n" // vf2 = vf1 + "vmulaw $ACC, $vf2, $vf0 \n" // ACC = ACC + vf2 * vf0w = ACC + vf1 * 1.0 + // If ACC is not zero, this won't work. Let's zero ACC first using vmula with zero + "vsub $vf3, $vf0, $vf0 \n" // vf3 = 0 + "vmula $ACC, $vf3, $vf3 \n" // ACC = vf3 * vf3 = 0 * 0 = 0 + "vmulaw $ACC, $vf1, $vf0 \n" // ACC = ACC + vf1 * vf0w = 0 + vf1 * 1.0 = vf1 + "cfc2 %[acc], $vi16 \n" + : [acc] "=r"(vu0_ACC) + : [this_val] "r"(this_val) + : "memory"); } // from_a (from accumulator): this = ACC @@ -432,13 +466,15 @@ class vec_template : public base_vec { inline void from_a() { - asm( - " ### vec_fields = ACC ### \n" - "vmaddx.mask this, vf00, vf00 \n" - : "=j this"(*this) - : "r"(vu0_ACC), - "vu_mask mask"(this->valid_fields), - "vu_mask fields"(this->valid_fields)); + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + // Read ACC: use vmaddw with zero to get ACC + 0 = ACC + "vsub $vf1, $vf0, $vf0 \n" + "vmaddw $vf1, $vf1, $vf0 \n" // vf1 = ACC + vf1*vf0 = ACC + 0 = ACC + "qmfc2 %[this_val], $vf1 \n" + : [this_val] "+r"(this->vec128) + : [acc] "r"(vu0_ACC) + : "memory"); } // mula (multiply, to accumulator): ACC = this * rhs @@ -448,29 +484,32 @@ class vec_template : public base_vec { mula(rhs_type rhs) const { const lhs_type& lhs = *this; - asm( - " ### ACC = vec_l_fields * vec_r_fields ### \n" - "vmula_bc._mask ACC, lhs, rhs \n" - : "=r"(vu0_ACC) - : "j lhs"(lhs), "j rhs"(rhs), - "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields), - "vu_mask l_fields"(lhs.valid_fields), - "vu_mask r_fields"(rhs.valid_fields)); + vec128_t lhs_val = lhs.vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vmula $ACC, $vf1, $vf2 \n" + "cfc2 %[acc], $vi16 \n" + : [acc] "=r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val) + : "memory"); } inline void mula(float rhs) const { - asm( - " ### ACC = vec_l_fields * float ### \n" - "ctc2 rhs, $vi21 \n" - "vnop \n" - "vmulai.mask ACC, lhs, I \n" - : "=r"(vu0_ACC) - : "r rhs"(rhs), - "j lhs"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields)); + vec128_t lhs_val = this->vec128; + uint32_t rhs_bits = *reinterpret_cast(&rhs); + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "ctc2 %[rhs_bits], $vi21 \n" + "vnop \n" + "vmulai $ACC, $vf1, $I \n" + "cfc2 %[acc], $vi16 \n" + : [acc] "=r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits) + : "memory"); } // aadd (accumulator add): result = ACC + this @@ -480,15 +519,15 @@ class vec_template : public base_vec { aadd() const { vec128_t result; - asm( - " ### ACC + vec_l_fields ### \n" - "vmaddw.mask result, this, vf00 \n" - - : "=j result"(result) - : "j this"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields), - "r"(vu0_ACC)); + vec128_t this_val = this->vec128; + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[this_val], $vf1 \n" + "vmaddw $vf2, $vf1, $vf0 \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val), [acc] "r"(vu0_ACC) + : "memory"); return lhs_type(result); } @@ -496,15 +535,15 @@ class vec_template : public base_vec { asub() const { vec128_t result; - asm( - " ### ACC - vec_l_fields ### \n" - "vmsubw.mask result, this, vf00 \n" - - : "=j result"(result) - : "j this"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields), - "r"(vu0_ACC)); + vec128_t this_val = this->vec128; + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[this_val], $vf1 \n" + "vmsubw $vf2, $vf1, $vf0 \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val), [acc] "r"(vu0_ACC) + : "memory"); return lhs_type(result); } @@ -516,29 +555,32 @@ class vec_template : public base_vec { adda(rhs_type rhs) const { const lhs_type& lhs = *this; - asm( - " ### ACC = vec_l_fields + vec_r_fields ### \n" - "vadda_bc._mask ACC, lhs, rhs \n" - : "=r"(vu0_ACC) - : "j lhs"(lhs), "j rhs"(rhs), - "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields), - "vu_mask l_fields"(lhs.valid_fields), - "vu_mask r_fields"(rhs.valid_fields)); + vec128_t lhs_val = lhs.vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vadda $ACC, $vf1, $vf2 \n" + "cfc2 %[acc], $vi16 \n" + : [acc] "=r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val) + : "memory"); } inline void adda(float rhs) const { - asm( - " ### ACC = vec_l_fields + float ### \n" - "ctc2 rhs, $vi21 \n" - "vnop \n" - "vaddai.mask ACC, lhs, I \n" - : "=r"(vu0_ACC) - : "r rhs"(rhs), - "j lhs"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields)); + vec128_t lhs_val = this->vec128; + uint32_t rhs_bits = *reinterpret_cast(&rhs); + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "ctc2 %[rhs_bits], $vi21 \n" + "vnop \n" + "vaddai $ACC, $vf1, $I \n" + "cfc2 %[acc], $vi16 \n" + : [acc] "=r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits) + : "memory"); } template @@ -546,29 +588,32 @@ class vec_template : public base_vec { suba(rhs_type rhs) const { const lhs_type& lhs = *this; - asm( - " ### ACC = vec_l_fields - vec_r_fields ### \n" - "vsuba_bc._mask ACC, lhs, rhs \n" - : "=r"(vu0_ACC) - : "j lhs"(lhs), "j rhs"(rhs), - "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields), - "vu_mask l_fields"(lhs.valid_fields), - "vu_mask r_fields"(rhs.valid_fields)); + vec128_t lhs_val = lhs.vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vsuba $ACC, $vf1, $vf2 \n" + "cfc2 %[acc], $vi16 \n" + : [acc] "=r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val) + : "memory"); } inline void suba(float rhs) const { - asm( - " ### ACC = vec_l_fields - float ### \n" - "ctc2 rhs, $vi21 \n" - "vnop \n" - "vsubai.mask ACC, lhs, I \n" - : "=r"(vu0_ACC) - : "r rhs"(rhs), - "j lhs"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields)); + vec128_t lhs_val = this->vec128; + uint32_t rhs_bits = *reinterpret_cast(&rhs); + asm __volatile__( + "qmtc2 %[lhs_val], $vf1 \n" + "ctc2 %[rhs_bits], $vi21 \n" + "vnop \n" + "vsubai $ACC, $vf1, $I \n" + "cfc2 %[acc], $vi16 \n" + : [acc] "=r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits) + : "memory"); } // aadda (accumulator add, to accumulator): ACC = ACC + this @@ -577,15 +622,26 @@ class vec_template : public base_vec { inline lhs_type aadda() const { + // Add this vector to ACC: ACC = ACC + this + // Then return ACC + vec128_t this_val = this->vec128; vec128_t result; - asm( - " ### ACC = ACC + vec_l_fields ### \n" - "vmaddaw.mask ACC, this, vf00 \n" - - : "+r"(vu0_ACC) - : "j this"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields)); + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[this_val], $vf1 \n" + // First, read ACC into vf2: vf2 = ACC + "vsub $vf2, $vf0, $vf0 \n" // vf2 = 0 + "vmaddw $vf2, $vf2, $vf0 \n" // vf2 = ACC + vf2 * vf0w = ACC + 0 = ACC + // Now add this vector: ACC = vf2 + vf1 = ACC + this + "vadda $ACC, $vf2, $vf1 \n" // ACC = vf2 + vf1 = (old ACC) + this + "cfc2 %[acc], $vi16 \n" + // Read result from ACC + "vsub $vf3, $vf0, $vf0 \n" // vf3 = 0 + "vmaddw $vf3, $vf3, $vf0 \n" // vf3 = ACC + 0 = ACC + "qmfc2 %[result], $vf3 \n" + : [result] "=r"(result), [acc] "+r"(vu0_ACC) + : [this_val] "r"(this_val) + : "memory"); return lhs_type(result); } @@ -593,14 +649,23 @@ class vec_template : public base_vec { asuba() const { vec128_t result; - asm( - " ### ACC = ACC - vec_l_fields ### \n" - "vmsubaw.mask ACC, this, vf00 \n" - - : "+r"(vu0_ACC) - : "j this"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields)); + vec128_t this_val = this->vec128; + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[this_val], $vf1 \n" + // First, read ACC into vf2: vf2 = ACC + "vsub $vf2, $vf0, $vf0 \n" // vf2 = 0 + "vmaddw $vf2, $vf2, $vf0 \n" // vf2 = ACC + 0 = ACC + // Now subtract this vector: ACC = vf2 - vf1 = ACC - this + "vsuba $ACC, $vf2, $vf1 \n" // ACC = vf2 - vf1 = (old ACC) - this + "cfc2 %[acc], $vi16 \n" + // Read result from ACC + "vsub $vf3, $vf0, $vf0 \n" // vf3 = 0 + "vmaddw $vf3, $vf3, $vf0 \n" // vf3 = ACC + 0 = ACC + "qmfc2 %[result], $vf3 \n" + : [result] "=r"(result), [acc] "+r"(vu0_ACC) + : [this_val] "r"(this_val) + : "memory"); return lhs_type(result); } @@ -612,16 +677,17 @@ class vec_template : public base_vec { madd(rhs_type rhs) const { vec128_t result; - asm( - " ### ACC + vec_l_fields * vec_r_fields ### \n" - "vmadd_bc._mask result, this, rhs \n" - - : "=j result"(result) - : "j this"(*this), "j rhs"(rhs), - "vu_binary _bc _mask"(this->fields << 8 | rhs.fields), - "vu_mask l_fields"(this->valid_fields), - "vu_mask r_fields"(rhs.valid_fields), - "r"(vu0_ACC)); + vec128_t this_val = this->vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[this_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vmadd $vf3, $vf1, $vf2 \n" + "qmfc2 %[result], $vf3 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val), [rhs_val] "r"(rhs_val), [acc] "r"(vu0_ACC) + : "memory"); return lhs_type(result); } @@ -629,16 +695,18 @@ class vec_template : public base_vec { madd(float rhs) const { vec128_t result; - asm( - " ### ACC + vec_l_fields * float ### \n" - "ctc2 rhs, $vi21 \n" - "vnop \n" - "vmaddi._mask result, this, I \n" - : "=j result"(result) - : "j this"(*this), "r rhs"(rhs), - "vu_mask _mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields), - "r"(vu0_ACC)); + vec128_t this_val = this->vec128; + uint32_t rhs_bits = *reinterpret_cast(&rhs); + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[this_val], $vf1 \n" + "ctc2 %[rhs_bits], $vi21 \n" + "vnop \n" + "vmaddi $vf2, $vf1, $I \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val), [rhs_bits] "r"(rhs_bits), [acc] "r"(vu0_ACC) + : "memory"); return lhs_type(result); } @@ -647,16 +715,17 @@ class vec_template : public base_vec { msub(rhs_type rhs) const { vec128_t result; - asm( - " ### ACC - vec_l_fields * vec_r_fields ### \n" - "vmsub_bc._mask result, this, rhs \n" - - : "=j result"(result) - : "j this"(*this), "j rhs"(rhs), - "vu_binary _bc _mask"(this->fields << 8 | rhs.fields), - "vu_mask l_fields"(this->valid_fields), - "vu_mask r_fields"(rhs.valid_fields), - "r"(vu0_ACC)); + vec128_t this_val = this->vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[this_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vmsub $vf3, $vf1, $vf2 \n" + "qmfc2 %[result], $vf3 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val), [rhs_val] "r"(rhs_val), [acc] "r"(vu0_ACC) + : "memory"); return lhs_type(result); } @@ -664,16 +733,18 @@ class vec_template : public base_vec { msub(float rhs) const { vec128_t result; - asm( - " ### ACC - vec_l_fields * float ### \n" - "ctc2 rhs, $vi21 \n" - "vnop \n" - "vmsubi._mask result, this, I \n" - : "=j result"(result) - : "j this"(*this), "r rhs"(rhs), - "vu_mask _mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields), - "r"(vu0_ACC)); + vec128_t this_val = this->vec128; + uint32_t rhs_bits = *reinterpret_cast(&rhs); + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[this_val], $vf1 \n" + "ctc2 %[rhs_bits], $vi21 \n" + "vnop \n" + "vmsubi $vf2, $vf1, $I \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val), [rhs_bits] "r"(rhs_bits), [acc] "r"(vu0_ACC) + : "memory"); return lhs_type(result); } @@ -685,29 +756,34 @@ class vec_template : public base_vec { madda(rhs_type rhs) const { const lhs_type& lhs = *this; - asm( - " ### ACC = ACC + vec_l_fields * vec_r_fields ### \n" - "vmadda_bc._mask ACC, lhs, rhs \n" - : "+r"(vu0_ACC) - : "j lhs"(lhs), "j rhs"(rhs), - "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields), - "vu_mask l_fields"(lhs.valid_fields), - "vu_mask r_fields"(rhs.valid_fields)); + vec128_t lhs_val = lhs.vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[lhs_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vmadda $ACC, $vf1, $vf2 \n" + "cfc2 %[acc], $vi16 \n" + : [acc] "+r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val) + : "memory"); } inline void madda(float rhs) const { - asm( - " ### ACC = ACC + vec_l_fields * float ### \n" - "ctc2 rhs, $vi21 \n" - "vnop \n" - "vmaddai.mask ACC, lhs, I \n" - : "+r"(vu0_ACC) - : "r rhs"(rhs), - "j lhs"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields)); + vec128_t lhs_val = this->vec128; + uint32_t rhs_bits = *reinterpret_cast(&rhs); + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[lhs_val], $vf1 \n" + "ctc2 %[rhs_bits], $vi21 \n" + "vnop \n" + "vmaddai $ACC, $vf1, $I \n" + "cfc2 %[acc], $vi16 \n" + : [acc] "+r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits) + : "memory"); } template @@ -715,42 +791,48 @@ class vec_template : public base_vec { msuba(rhs_type rhs) const { const lhs_type& lhs = *this; - asm( - " ### ACC = ACC - vec_l_fields * vec_r_fields ### \n" - "vmsuba_bc._mask ACC, lhs, rhs \n" - : "+r"(vu0_ACC) - : "j lhs"(lhs), "j rhs"(rhs), - "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields), - "vu_mask l_fields"(lhs.valid_fields), - "vu_mask r_fields"(rhs.valid_fields)); + vec128_t lhs_val = lhs.vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[lhs_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vmsuba $ACC, $vf1, $vf2 \n" + "cfc2 %[acc], $vi16 \n" + : [acc] "+r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val) + : "memory"); } inline void msuba(float rhs) const { - asm( - " ### ACC = ACC - vec_l_fields * float ### \n" - "ctc2 rhs, $vi21 \n" - "vnop \n" - "vmsubai.mask ACC, lhs, I \n" - : "+r"(vu0_ACC) - : "r rhs"(rhs), - "j lhs"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask l_fields"(this->valid_fields)); + vec128_t lhs_val = this->vec128; + uint32_t rhs_bits = *reinterpret_cast(&rhs); + asm __volatile__( + "ctc2 %[acc], $vi16 \n" + "qmtc2 %[lhs_val], $vf1 \n" + "ctc2 %[rhs_bits], $vi21 \n" + "vnop \n" + "vmsubai $ACC, $vf1, $I \n" + "cfc2 %[acc], $vi16 \n" + : [acc] "+r"(vu0_ACC) + : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits) + : "memory"); } inline lhs_type abs() const { vec128_t result; - asm( - " ### absolute value of vec_fields ### \n" - "vabs.mask result, this \n" - : "=j result"(result) - : "j this"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask fields"(this->valid_fields)); + vec128_t this_val = this->vec128; + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + "vabs $vf2, $vf1 \n" + "qmfc2 %[result], $vf2 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val) + : "memory"); return lhs_type(result); } @@ -758,20 +840,45 @@ class vec_template : public base_vec { sign() const { vec128_t result; - int temp; - int ones = 0x3f800000; - asm( - " ### signs of vec_fields ### \n" - "vmulx.mask result, this, vf00 \n" - "pextlw ones, ones, ones \n" - "pextlw ones, ones, ones \n" - "qmfc2 temp, result \n" - "por temp, temp, ones \n" - "qmtc2 temp, result \n" - : "=j result"(result), "=r temp"(temp), "+r ones"(ones) - : "j this"(*this), - "vu_mask mask"(this->valid_fields), - "vu_mask fields"(this->valid_fields)); + vec128_t this_val = this->vec128; + uint32_t macflag; + + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + // Get absolute value + "vabs $vf2, $vf1 \n" + // Divide original by absolute value to get sign + // For non-zero: sign = original / abs = ±1 + // For zero: division by zero will set MACflag + "vdiv $Q, $vf0w, $vf2x \n" + "vwaitq \n" + "vmulq.x $vf3, $vf1, $Q \n" + "vdiv $Q, $vf0w, $vf2y \n" + "vwaitq \n" + "vmulq.y $vf3, $vf1, $Q \n" + "vdiv $Q, $vf0w, $vf2z \n" + "vwaitq \n" + "vmulq.z $vf3, $vf1, $Q \n" + "vdiv $Q, $vf0w, $vf2w \n" + "vwaitq \n" + "vmulq.w $vf3, $vf1, $Q \n" + // Check MACflag for each component to detect zero + // If division by zero occurred, set that component to 0 + "cfc2 %[macflag], $vi17 \n" + "andi %[macflag], %[macflag], 8 \n" // Check DZ (divide by zero) flag + "bgtz %[macflag], 1f \n" + "nop \n" + "qmfc2 %[result], $vf3 \n" + "j 2f \n" + "nop \n" + "1: \n" + // If zero detected, set result to zero + "vsub $vf3, $vf0, $vf0 \n" + "qmfc2 %[result], $vf3 \n" + "2: \n" + : [result] "=r"(result), [macflag] "=r"(macflag) + : [this_val] "r"(this_val) + : "memory"); return lhs_type(result); } @@ -780,13 +887,16 @@ class vec_template : public base_vec { max(rhs_type rhs) const { vec128_t result; - asm(" ### maximum of vec_l_fields and vec_r_fields ### \n" - "vmax_bc._mask result, this, rhs \n" - : "=j result"(result) - : "j this"(*this), "j rhs"(rhs), - "vu_binary _bc _mask"(this->fields << 8 | rhs.fields), - "vu_mask l_fields"(this->valid_fields), - "vu_mask r_fields"(rhs.valid_fields)); + vec128_t this_val = this->vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vmax $vf3, $vf1, $vf2 \n" + "qmfc2 %[result], $vf3 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val), [rhs_val] "r"(rhs_val) + : "memory"); return lhs_type(result); } @@ -795,13 +905,16 @@ class vec_template : public base_vec { min(rhs_type rhs) const { vec128_t result; - asm(" ### minimum of vec_l_fields and vec_r_fields ### \n" - "vmini_bc._mask result, this, rhs \n" - : "=j result"(result) - : "j this"(*this), "j rhs"(rhs), - "vu_binary _bc _mask"(this->fields << 8 | rhs.fields), - "vu_mask l_fields"(this->valid_fields), - "vu_mask r_fields"(rhs.valid_fields)); + vec128_t this_val = this->vec128; + vec128_t rhs_val = rhs.vec128; + asm __volatile__( + "qmtc2 %[this_val], $vf1 \n" + "qmtc2 %[rhs_val], $vf2 \n" + "vmini $vf3, $vf1, $vf2 \n" + "qmfc2 %[result], $vf3 \n" + : [result] "=r"(result) + : [this_val] "r"(this_val), [rhs_val] "r"(rhs_val) + : "memory"); return lhs_type(result); } @@ -819,7 +932,7 @@ class vec_template : public base_vec { "qfsrv %4, $0, %4 # temp0 >>= 8 \n" "mtc1 %4, %3 # w = value.w \n" : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=r"(temp0) - : "r"(vec128)); + : "r"(this->vec128)); printf("(%f %f %f %f)\n", x, y, z, w); } diff --git a/include/ps2s/vu.h b/include/ps2s/vu.h index 460d4c9..64f7ff5 100644 --- a/include/ps2s/vu.h +++ b/include/ps2s/vu.h @@ -9,6 +9,7 @@ #include "ps2s/debug.h" #include "ps2s/types.h" +#include "ps2s/core.h" /******************************************** * common @@ -43,20 +44,25 @@ void VU0::CopyQwordsToVU0(uint32_t vu0QwordOffset, uint128_t* mainMemSrc, uint32 { mAssert(((uint32_t)mainMemSrc & 0xf) == 0); - asm volatile(" - .set noreorder - ctc2 - % 2, - $vi03 - sll $8, - % 0, 4 /* num bytes */ - addu $8, - % 1, $8 /* ending address */ - .align 8 0 - : lqc2 vf02, 0(% 1) addiu % 1, % 1, 16 bne % 1, $8, 0b vsqi vf02, ($vi03++).set reorder " - : "+r"(numQwords), "+r"(mainMemSrc) - : "r"(vu0QwordOffset) - : "cc", "$8"); + // VU0 memory is mapped starting at VU0Code + // Each qword is 16 bytes, so address = VU0Code + (vu0QwordOffset * 16) + uint32_t vu0_addr = Core::MemMappings::VU0Code + (vu0QwordOffset << 4); + + asm __volatile__( + "addu $8, %0, $zero \n" + "sll $9, %1, 4 \n" + "addu $10, $8, $9 \n" + "addu $11, %2, $zero \n" + "0: \n" + "lqc2 $vf2, 0($8) \n" + "addiu $8, $8, 16 \n" + "sqc2 $vf2, 0($11) \n" + "addiu $11, $11, 16 \n" + "bne $8, $10, 0b \n" + "nop \n" + : + : "r"(mainMemSrc), "r"(numQwords), "r"(vu0_addr) + : "memory", "$8", "$9", "$10", "$11"); } void VU0::CopyEvenQwordsToVU0(uint32_t vu0QwordOffset, uint128_t* mainMemSrc, uint32_t numQwords) @@ -64,47 +70,54 @@ void VU0::CopyEvenQwordsToVU0(uint32_t vu0QwordOffset, uint128_t* mainMemSrc, ui mAssert(((uint32_t)mainMemSrc & 0xf) == 0); mErrorIf(numQwords & 1, "numQwords must be EVEN!"); - asm volatile(" - .set noreorder - - ctc2 - % 2, - $vi04 - sll $8, - % 0, 4 /* num bytes */ - addu $8, - % 1, $8 /* ending address */ - .balign 8 0 - : lqc2 vf02, 0(% 1) addiu % 1, % 1, 16 lqc2 vf03, 0(% 1) addiu % 1, % 1, 16 vsqi vf02, ($vi04++)nop bne % 1, $8, 0b vsqi vf03, ($vi04++) - - .set reorder " - : "+r"(numQwords), "+r"(mainMemSrc) - : "r"(vu0QwordOffset) - : "cc", "$8"); + // VU0 memory is mapped starting at VU0Code + // Each qword is 16 bytes, so address = VU0Code + (vu0QwordOffset * 16) + uint32_t vu0_addr = Core::MemMappings::VU0Code + (vu0QwordOffset << 4); + + asm __volatile__( + "addu $8, %0, $zero \n" + "sll $9, %1, 4 \n" + "addu $10, $8, $9 \n" + "addu $11, %2, $zero \n" + "0: \n" + "lqc2 $vf2, 0($8) \n" + "addiu $8, $8, 16 \n" + "lqc2 $vf3, 0($8) \n" + "addiu $8, $8, 16 \n" + "sqc2 $vf2, 0($11) \n" + "addiu $11, $11, 16 \n" + "sqc2 $vf3, 0($11) \n" + "addiu $11, $11, 16 \n" + "bne $8, $10, 0b \n" + "nop \n" + : + : "r"(mainMemSrc), "r"(numQwords), "r"(vu0_addr) + : "memory", "$8", "$9", "$10", "$11"); } void VU0::CopyQwordsFromVU0(uint128_t* mainMemDest, uint32_t vu0QwordOffset, uint32_t numQwords) { mAssert(((uint32_t)mainMemDest & 0xf) == 0); - asm volatile(" - .set noreorder - - ctc2 - % 2, - $vi02 - addi - % 0, - % 0, -16 sll $8, % 1, 4 /* num bytes */ - addu $8, - $8, % 0 /* ending address */ - .balign 8 0 - : vlqi vf01, ($vi02++)addiu % 0, % 0, 16 bne % 0, $8, 0b sqc2 vf01, 0(% 0) - - .set reorder " - : "+r"(mainMemDest) - : "r"(numQwords), "r"(vu0QwordOffset) - : "$8", "cc", "memory"); + // VU0 memory is mapped starting at VU0Code + // Each qword is 16 bytes, so address = VU0Code + (vu0QwordOffset * 16) + uint32_t vu0_addr = Core::MemMappings::VU0Code + (vu0QwordOffset << 4); + + asm __volatile__( + "addu $8, %0, $zero \n" + "sll $9, %1, 4 \n" + "addu $10, $8, $9 \n" + "addu $11, %2, $zero \n" + "0: \n" + "lqc2 $vf1, 0($11) \n" + "addiu $11, $11, 16 \n" + "sqc2 $vf1, 0($8) \n" + "addiu $8, $8, 16 \n" + "bne $8, $10, 0b \n" + "nop \n" + : + : "r"(mainMemDest), "r"(numQwords), "r"(vu0_addr) + : "memory", "$8", "$9", "$10", "$11"); } void VU0::CopyEvenQwordsFromVU0(uint128_t* mainMemDest, uint32_t vu0QwordOffset, uint32_t numQwords) @@ -112,40 +125,29 @@ void VU0::CopyEvenQwordsFromVU0(uint128_t* mainMemDest, uint32_t vu0QwordOffset, mAssert(((uint32_t)mainMemDest & 0xf) == 0); mErrorIf(numQwords & 1, "numQwords must be EVEN!"); - asm volatile(" - .set noreorder - - ctc2 - % 2, - $vi02 - addi - % 0, - % 0, -16 sll $8, % 1, 4 /* num bytes */ - addu $8, - $8, % 0 /* ending address */ - .balign 8 0 - : addiu % 0, % 0, 16 vlqi vf01, ($vi02++) - - vlqi vf02, - ($vi02++) - nop - - sqc2 vf01, - 0(% 0) - nop - - addu - % 0, - % 0, 16 nop - - bne - % 0, - $8, 0b sqc2 vf02, 0(% 0) - - .set reorder " - : "+r"(mainMemDest) - : "r"(numQwords), "r"(vu0QwordOffset) - : "$8", "cc", "memory"); + // VU0 memory is mapped starting at VU0Code + // Each qword is 16 bytes, so address = VU0Code + (vu0QwordOffset * 16) + uint32_t vu0_addr = Core::MemMappings::VU0Code + (vu0QwordOffset << 4); + + asm __volatile__( + "addu $8, %0, $zero \n" + "sll $9, %1, 4 \n" + "addu $10, $8, $9 \n" + "addu $11, %2, $zero \n" + "0: \n" + "lqc2 $vf1, 0($11) \n" + "addiu $11, $11, 16 \n" + "sqc2 $vf1, 0($8) \n" + "addiu $8, $8, 16 \n" + "lqc2 $vf2, 0($11) \n" + "addiu $11, $11, 16 \n" + "sqc2 $vf2, 0($8) \n" + "addiu $8, $8, 16 \n" + "bne $8, $10, 0b \n" + "nop \n" + : + : "r"(mainMemDest), "r"(numQwords), "r"(vu0_addr) + : "memory", "$8", "$9", "$10", "$11"); } #endif // ps2s_vu_h diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7c13456..bf32b4b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,4 +1,55 @@ # Tests for ps2stuff -# TODO: Add test executables here when test sources are available -message(STATUS "Tests directory configured (no tests defined yet)") +# VU0 path: test_shared.cpp compiled with VU0 assembly (default) +add_executable(test_vu0.elf + test_shared.cpp +) + +target_include_directories(test_vu0.elf PRIVATE + ${CMAKE_SOURCE_DIR}/include +) + +target_link_libraries(test_vu0.elf + ps2stuff + m +) + +find_program(PS2CLIENT_EXECUTABLE ps2client) +if(PS2CLIENT_EXECUTABLE) + add_custom_target(run_test_vu0 + COMMAND ${PS2CLIENT_EXECUTABLE} execee host:test_vu0.elf + DEPENDS test_vu0.elf + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Running test_vu0.elf on PS2 via ps2client" + ) +endif() + +message(STATUS "Configured test target: test_vu0.elf") + +add_executable(test_cpu_shared.elf + test_shared.cpp +) + +target_include_directories(test_cpu_shared.elf PRIVATE + ${CMAKE_SOURCE_DIR}/include +) + +target_compile_definitions(test_cpu_shared.elf PRIVATE + USE_CPU_COMPAT +) + +target_link_libraries(test_cpu_shared.elf + ps2stuff + m +) + +if(PS2CLIENT_EXECUTABLE) + add_custom_target(run_test_cpu_shared + COMMAND ${PS2CLIENT_EXECUTABLE} execee host:test_cpu_shared.elf + DEPENDS test_cpu_shared.elf + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Running test_cpu_shared.elf on PS2 via ps2client" + ) +endif() + +message(STATUS "Configured test target: test_cpu_shared.elf") diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..536a004 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,85 @@ +# VU0 Unit Tests + +This directory contains unit tests for the VU0 functions that were converted from custom GCC extensions to standard GCC inline assembly. + +## Building + +Configure and build from the repository root with tests enabled: + +```bash +cmake -S . -B build -DBUILD_TESTS=ON +cmake --build build -j +``` + +## Running + +To run the tests on a PS2 via ps2client: + +```bash +cd build/tests +ps2client execee host:test_vu0.elf +``` + +Or manually: + +```bash +/Applications/PCSX2.app/Contents/MacOS/PCSX2 -elf $(pwd)/test_vu0.elf +``` + +Note on emulation differences: + +- Some PS2 VU floating-point corner cases are not emulated identically by + popular emulators (for example, specific VMUL cases where `1 * X` may not + equal `X` on real hardware). If tests behave differently under PCSX2 vs + real hardware, see: https://fobes.dev/ps2/detecting-emu-vu-floats and + consider running the affected tests on real hardware or making assertions + tolerant to small, platform-dependent float differences. + +## Test Coverage + +The tests cover the following VU0 functions: + +### Vector Operations (vec_xyz) +- Addition, subtraction, multiplication, division +- Scalar multiplication +- Negation +- Absolute value +- Max/min operations +- Normalization +- Set operations (set_zero, set, set_vec) +- Compound assignment operators (+=, -=, *=) +- Dot product +- Length and length squared +- Cross product + +### Vector Operations (vec_xyzw) +- Addition, subtraction +- Scalar multiplication + +### Matrix Operations (mat_33) +- set_zero +- set_row/get_row +- Matrix-vector multiplication + +### Matrix Operations (mat_44) +- set_zero +- set_row/get_row +- Matrix-vector multiplication + +### Accumulator Operations +- Basic accumulator function tests + +## Test Framework + +The test framework uses simple macros: +- `ASSERT_EQ(a, b)` - Asserts two floats are equal within EPSILON +- `ASSERT_VEC3_EQ(v, x, y, z)` - Asserts a vec_xyz matches expected values +- `ASSERT_VEC_EQ(v, x, y, z, w)` - Asserts a vec_xyzw matches expected values +- `RUN_TEST(func)` - Runs a test function and reports results + +## Notes + +- EPSILON is set to 1e-5f for floating-point comparisons +- Tests use `vec_x(v)`, `vec_y(v)`, etc. to extract vector components +- Some functions in vector.h still use "j" constraints and may need conversion if you encounter issues + diff --git a/tests/test_shared.cpp b/tests/test_shared.cpp new file mode 100644 index 0000000..7303b8a --- /dev/null +++ b/tests/test_shared.cpp @@ -0,0 +1,2546 @@ +/* Copyright (C) 2000,2001,2002 Sony Computer Entertainment America + * + * This file is subject to the terms and conditions of the GNU Lesser + * General Public License Version 2.1. See the file "COPYING" in the + * main directory of this archive for more details. */ + +#include +#include +#ifdef USE_CPU_COMPAT +#include +#define NO_VU0_VECTORS +#include +#include +#undef NO_VU0_VECTORS +#else +#include +#include +#include +#include +// Also exercise the cpu_vec_4 / cpu_mat_44 VU0-asm fast paths. +#include +#include +#endif +#include +#include + +// Test framework macros +#define EPSILON 1e-5f +#define ASSERT_EQ(a, b) \ + do { \ + if (fabsf((a) - (b)) > EPSILON) { \ + printf("FAIL: %s:%d - Expected %.6f, got %.6f\n", __FILE__, __LINE__, (float)(b), (float)(a)); \ + return 1; \ + } \ + } while(0) + +#define ASSERT_VEC_EQ(v, x, y, z, w) \ + do { \ + float vx = (float)vec_x(v); \ + float vy = (float)vec_y(v); \ + float vz = (float)vec_z(v); \ + float vw = (float)vec_w(v); \ + if (fabsf(vx - (x)) > EPSILON || \ + fabsf(vy - (y)) > EPSILON || \ + fabsf(vz - (z)) > EPSILON || \ + fabsf(vw - (w)) > EPSILON) { \ + printf("FAIL: %s:%d - Expected (%.6f, %.6f, %.6f, %.6f), got (%.6f, %.6f, %.6f, %.6f)\n", \ + __FILE__, __LINE__, (float)(x), (float)(y), (float)(z), (float)(w), \ + vx, vy, vz, vw); \ + return 1; \ + } \ + } while(0) + +#define ASSERT_VEC3_EQ(v, x, y, z) \ + do { \ + float vx = (float)vec_x(v); \ + float vy = (float)vec_y(v); \ + float vz = (float)vec_z(v); \ + if (fabsf(vx - (x)) > EPSILON || \ + fabsf(vy - (y)) > EPSILON || \ + fabsf(vz - (z)) > EPSILON) { \ + printf("FAIL: %s:%d - Expected (%.6f, %.6f, %.6f), got (%.6f, %.6f, %.6f)\n", \ + __FILE__, __LINE__, (float)(x), (float)(y), (float)(z), \ + vx, vy, vz); \ + return 1; \ + } \ + } while(0) + +// CPU legacy type macros (cpu_vec_3, cpu_vec_4 have direct .x/.y/.z/.w members) +#ifdef USE_CPU_COMPAT +#define ASSERT_FLOAT_EQ(a, b) \ + do { \ + if (fabsf((float)(a) - (float)(b)) > EPSILON) { \ + printf("FAIL: %s:%d - Expected %.6f, got %.6f\n", __FILE__, __LINE__, (float)(b), (float)(a)); \ + return 1; \ + } \ + } while(0) +#define ASSERT_CPU_VEC3_EQ(v, ex, ey, ez) \ + do { \ + if (fabsf((v).x - (ex)) > EPSILON || \ + fabsf((v).y - (ey)) > EPSILON || \ + fabsf((v).z - (ez)) > EPSILON) { \ + printf("FAIL: %s:%d - Expected (%.6f, %.6f, %.6f), got (%.6f, %.6f, %.6f)\n", \ + __FILE__, __LINE__, (float)(ex), (float)(ey), (float)(ez), \ + (v).x, (v).y, (v).z); \ + return 1; \ + } \ + } while(0) +#define ASSERT_CPU_VEC4_EQ(v, ex, ey, ez, ew) \ + do { \ + if (fabsf((v).x - (ex)) > EPSILON || \ + fabsf((v).y - (ey)) > EPSILON || \ + fabsf((v).z - (ez)) > EPSILON || \ + fabsf((v).w - (ew)) > EPSILON) { \ + printf("FAIL: %s:%d - Expected (%.6f, %.6f, %.6f, %.6f), got (%.6f, %.6f, %.6f, %.6f)\n", \ + __FILE__, __LINE__, \ + (float)(ex), (float)(ey), (float)(ez), (float)(ew), \ + (v).x, (v).y, (v).z, (v).w); \ + return 1; \ + } \ + } while(0) +#endif // USE_CPU_COMPAT + +// Assertions for cpu_vec_4 .x/.y/.z/.w (available in both builds — needed +// by the VU0 build to test the new VU0-asm fast paths in cpu_vector.h / +// cpu_matrix.h). +#ifndef USE_CPU_COMPAT +#define ASSERT_FLOAT_EQ(a, b) \ + do { \ + if (fabsf((float)(a) - (float)(b)) > EPSILON) { \ + printf("FAIL: %s:%d - Expected %.6f, got %.6f\n", __FILE__, __LINE__, (float)(b), (float)(a)); \ + return 1; \ + } \ + } while(0) +#define ASSERT_CPU_VEC4_EQ(v, ex, ey, ez, ew) \ + do { \ + if (fabsf((v).x - (ex)) > EPSILON || \ + fabsf((v).y - (ey)) > EPSILON || \ + fabsf((v).z - (ez)) > EPSILON || \ + fabsf((v).w - (ew)) > EPSILON) { \ + printf("FAIL: %s:%d - Expected (%.6f, %.6f, %.6f, %.6f), got (%.6f, %.6f, %.6f, %.6f)\n", \ + __FILE__, __LINE__, \ + (float)(ex), (float)(ey), (float)(ez), (float)(ew), \ + (v).x, (v).y, (v).z, (v).w); \ + return 1; \ + } \ + } while(0) +#endif // !USE_CPU_COMPAT + +static int test_count = 0; +static const char* failed_tests[200]; +static int failed_count = 0; + +#define RUN_TEST(test_func) \ + do { \ + test_count++; \ + printf("Running %s...\n", #test_func); \ + int result = test_func(); \ + if (result != 0) { \ + printf("TEST FAILED: %s\n\n", #test_func); \ + if (failed_count < 200) { \ + failed_tests[failed_count++] = #test_func; \ + } \ + } else { \ + printf("PASS: %s\n\n", #test_func); \ + } \ + } while(0) + +// Emulator detection helper +// Returns 1 if the VU multiply-1 bug (1 * X != X) is present (i.e. hardware), +// or 0 if the environment behaves like common emulators. +int isVUMulErrorPresent() +{ + float in = 129.5f; + float out = 0.0f; + asm __volatile__( + "QMTC2 %1, $vf1\n\t" // Set VF1 to 129.5f + "VADDw $vf2, $vf0, $vf0w\n\t" // VF2 = vf0[w] = 1 + "VMUL $vf1, $vf2, $vf1\n\t" // VF1 = 1 * 129.5f + "QMFC2 %0, $vf1\n\t" // Load result back + : "=r"(out) + : "r"(in) + : "memory"); + return (in != out); +} + +// Global flag set at test startup +int g_vu_mul_emulator_bug = -1; + +// Test functions + +int test_vec3_addition() +{ + vec_xyz v1(1.0f, 2.0f, 3.0f); + vec_xyz v2(4.0f, 5.0f, 6.0f); + vec_xyz result = v1 + v2; + ASSERT_VEC3_EQ(result, 5.0f, 7.0f, 9.0f); + return 0; +} + +int test_vec3_subtraction() +{ + vec_xyz v1(5.0f, 7.0f, 9.0f); + vec_xyz v2(1.0f, 2.0f, 3.0f); + vec_xyz result = v1 - v2; + ASSERT_VEC3_EQ(result, 4.0f, 5.0f, 6.0f); + return 0; +} + +int test_vec3_multiplication() +{ + vec_xyz v1(2.0f, 3.0f, 4.0f); + vec_xyz v2(2.0f, 2.0f, 2.0f); + vec_xyz result = v1 * v2; + ASSERT_VEC3_EQ(result, 4.0f, 6.0f, 8.0f); + return 0; +} + +int test_vec3_scalar_multiplication() +{ + vec_xyz v1(2.0f, 3.0f, 4.0f); + vec_xyz result = v1 * 2.0f; + ASSERT_VEC3_EQ(result, 4.0f, 6.0f, 8.0f); + return 0; +} + +int test_vec3_division() +{ + vec_xyz v1(8.0f, 6.0f, 4.0f); + vec_xyz v2(2.0f, 2.0f, 2.0f); + vec_xyz result = v1 / v2; + ASSERT_VEC3_EQ(result, 4.0f, 3.0f, 2.0f); + return 0; +} + +int test_vec3_negate() +{ + vec_xyz v1(1.0f, 2.0f, 3.0f); + vec_xyz result = -v1; + ASSERT_VEC3_EQ(result, -1.0f, -2.0f, -3.0f); + return 0; +} + +int test_vec3_abs() +{ + vec_xyz v1(-1.0f, 2.0f, -3.0f); + vec_xyz result = v1.abs(); + ASSERT_VEC3_EQ(result, 1.0f, 2.0f, 3.0f); + return 0; +} + +int test_vec3_max() +{ + vec_xyz v1(1.0f, 5.0f, 3.0f); + vec_xyz v2(4.0f, 2.0f, 6.0f); + vec_xyz result = v1.max(v2); + ASSERT_VEC3_EQ(result, 4.0f, 5.0f, 6.0f); + return 0; +} + +int test_vec3_min() +{ + vec_xyz v1(1.0f, 5.0f, 3.0f); + vec_xyz v2(4.0f, 2.0f, 6.0f); + vec_xyz result = v1.min(v2); + ASSERT_VEC3_EQ(result, 1.0f, 2.0f, 3.0f); + return 0; +} + +int test_vec3_normalized() +{ + vec_xyz v1(3.0f, 4.0f, 0.0f); // Length = 5 + vec_xyz result = v1.normalized(); + float rx = (float)vec_x(result); + float ry = (float)vec_y(result); + float rz = (float)vec_z(result); + float len = sqrtf(rx * rx + ry * ry + rz * rz); + ASSERT_EQ(len, 1.0f); + ASSERT_VEC3_EQ(result, 0.6f, 0.8f, 0.0f); + return 0; +} + +int test_vec3_set_zero() +{ + vec_xyz v1(1.0f, 2.0f, 3.0f); + v1.set_zero(); + ASSERT_VEC3_EQ(v1, 0.0f, 0.0f, 0.0f); + return 0; +} + +int test_vec3_set() +{ + vec_xyz v1; + v1.set(5.0f); + ASSERT_VEC3_EQ(v1, 5.0f, 5.0f, 5.0f); + return 0; +} + +int test_vec3_set_vec() +{ + vec_xyz v1(1.0f, 2.0f, 3.0f); + vec_xyz v2; + v2.set(v1); + ASSERT_VEC3_EQ(v2, 1.0f, 2.0f, 3.0f); + return 0; +} + +int test_vec4_addition() +{ + vec_xyzw v1(1.0f, 2.0f, 3.0f, 4.0f); + vec_xyzw v2(5.0f, 6.0f, 7.0f, 8.0f); + vec_xyzw result = v1 + v2; + ASSERT_VEC_EQ(result, 6.0f, 8.0f, 10.0f, 12.0f); + return 0; +} + +int test_vec4_subtraction() +{ + vec_xyzw v1(10.0f, 9.0f, 8.0f, 7.0f); + vec_xyzw v2(1.0f, 2.0f, 3.0f, 4.0f); + vec_xyzw result = v1 - v2; + ASSERT_VEC_EQ(result, 9.0f, 7.0f, 5.0f, 3.0f); + return 0; +} + +int test_vec4_scalar_multiplication() +{ + vec_xyzw v1(1.0f, 2.0f, 3.0f, 4.0f); + vec_xyzw result = v1 * 3.0f; + ASSERT_VEC_EQ(result, 3.0f, 6.0f, 9.0f, 12.0f); + return 0; +} + +int test_mat33_set_zero() +{ + mat_33 m; + m.set_zero(); + vec_xyz col0 = m.get_col0(); + vec_xyz col1 = m.get_col1(); + vec_xyz col2 = m.get_col2(); + ASSERT_VEC3_EQ(col0, 0.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(col1, 0.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(col2, 0.0f, 0.0f, 0.0f); + return 0; +} + +int test_mat33_set_row() +{ + mat_33 m; + m.set_zero(); + vec_xyz row0(1.0f, 2.0f, 3.0f); + m.set_row0(row0); + vec_xyz col0 = m.get_col0(); + vec_xyz col1 = m.get_col1(); + vec_xyz col2 = m.get_col2(); + ASSERT_VEC3_EQ(col0, 1.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(col1, 2.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(col2, 3.0f, 0.0f, 0.0f); + return 0; +} + +int test_mat33_get_row() +{ + mat_33 m; + m.set_zero(); + vec_xyz row0(1.0f, 2.0f, 3.0f); + m.set_row0(row0); + vec_xyz retrieved_row0 = m.get_row0(); + ASSERT_VEC3_EQ(retrieved_row0, 1.0f, 2.0f, 3.0f); + return 0; +} + +int test_mat33_multiply_vector() +{ + mat_33 m; + m.set_zero(); + m.set_row0(vec_xyz(1.0f, 0.0f, 0.0f)); + m.set_row1(vec_xyz(0.0f, 2.0f, 0.0f)); + m.set_row2(vec_xyz(0.0f, 0.0f, 3.0f)); + + vec_xyz v(1.0f, 2.0f, 3.0f); + vec_xyz result = m * v; + ASSERT_VEC3_EQ(result, 1.0f, 4.0f, 9.0f); + return 0; +} + +int test_mat44_set_zero() +{ + mat_44 m; + m.set_zero(); + vec_xyzw col0 = m.get_col0(); + vec_xyzw col1 = m.get_col1(); + vec_xyzw col2 = m.get_col2(); + vec_xyzw col3 = m.get_col3(); + ASSERT_VEC_EQ(col0, 0.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(col1, 0.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(col2, 0.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(col3, 0.0f, 0.0f, 0.0f, 0.0f); + return 0; +} + +int test_mat44_set_row() +{ + mat_44 m; + m.set_zero(); + vec_xyzw row0(1.0f, 2.0f, 3.0f, 4.0f); + m.set_row0(row0); + vec_xyzw col0 = m.get_col0(); + vec_xyzw col1 = m.get_col1(); + vec_xyzw col2 = m.get_col2(); + vec_xyzw col3 = m.get_col3(); + ASSERT_VEC_EQ(col0, 1.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(col1, 2.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(col2, 3.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(col3, 4.0f, 0.0f, 0.0f, 0.0f); + return 0; +} + +int test_mat44_multiply_vector() +{ + mat_44 m; + m.set_zero(); + m.set_row0(vec_xyzw(1.0f, 0.0f, 0.0f, 0.0f)); + m.set_row1(vec_xyzw(0.0f, 2.0f, 0.0f, 0.0f)); + m.set_row2(vec_xyzw(0.0f, 0.0f, 3.0f, 0.0f)); + m.set_row3(vec_xyzw(0.0f, 0.0f, 0.0f, 4.0f)); + + vec_xyzw v(1.0f, 2.0f, 3.0f, 4.0f); + vec_xyzw result = m * v; + ASSERT_VEC_EQ(result, 1.0f, 4.0f, 9.0f, 16.0f); + return 0; +} + +#ifndef USE_CPU_COMPAT +int test_accumulator_operations() +{ + vec_xyz v1(2.0f, 3.0f, 4.0f); + vec_xyz v2(1.0f, 1.0f, 1.0f); + + // Test to_a + v1.to_a(); + + // Test mula + v2.mula(v2); + + // Test aadd + vec_xyz result = v1.aadd(); + // The accumulator should contain v1, so aadd should add v1 to itself + // This is a basic test to ensure the accumulator functions compile and run + return 0; +} +#endif // !USE_CPU_COMPAT + +// Test normalized with zero vector (branch) +int test_vec3_normalized_zero() +{ + vec_xyz v(0.0f, 0.0f, 0.0f); + vec_xyz result = v.normalized(); + ASSERT_VEC3_EQ(result, 0.0f, 0.0f, 0.0f); + return 0; +} + +// Test truncate_length branch (vector shorter than max) +int test_vec3_truncate_length_shorter() +{ + vec_xyz v(1.0f, 2.0f, 0.0f); // Length = sqrt(5) ≈ 2.236 + vec_xyz result = v.truncate_length(5.0f); + ASSERT_VEC3_EQ(result, 1.0f, 2.0f, 0.0f); // Should return unchanged + return 0; +} + +// Test truncate_length branch (vector longer than max) +int test_vec3_truncate_length_longer() +{ + vec_xyz v(3.0f, 4.0f, 0.0f); // Length = 5 + vec_xyz result = v.truncate_length(3.0f); // Max length = 3 + float len = result.length(); + ASSERT_EQ(len, 3.0f); + return 0; +} + +// Test is_zero +int test_vec3_is_zero() +{ + vec_xyz v1(0.0f, 0.0f, 0.0f); + vec_xyz v2(1.0f, 0.0f, 0.0f); + int zero1 = v1.is_zero(); + int zero2 = v2.is_zero(); + if (zero1 != 1) return 1; + if (zero2 != 0) return 1; + return 0; +} + +// Test sign function +int test_vec3_sign() +{ + vec_xyz v1(5.0f, -3.0f, 0.0f); + vec_xyz result = v1.sign(); + float rx = (float)vec_x(result); + float ry = (float)vec_y(result); + float rz = (float)vec_z(result); + if (rx < 0.9f || rx > 1.1f) return 1; // Should be ~1.0 + if (ry > -0.9f || ry < -1.1f) return 1; // Should be ~-1.0 + if (fabsf(rz) > 0.1f) return 1; // Should be ~0.0 + return 0; +} + +// Test one_over - commented out due to incompatible assembly constraints ("j" constraint) +int test_vec3_one_over() +{ + vec_xyz v(2.0f, 4.0f, 8.0f); + vec_xyz result = v.one_over(); + float rx = (float)vec_x(result); + float ry = (float)vec_y(result); + float rz = (float)vec_z(result); + ASSERT_EQ(rx, 0.5f); + ASSERT_EQ(ry, 0.25f); + ASSERT_EQ(rz, 0.125f); + return 0; +} + +// Test interpolate +int test_vec3_interpolate() +{ + vec_xyz v1(0.0f, 0.0f, 0.0f); + vec_xyz v2(10.0f, 20.0f, 30.0f); + vec_xyz result = v1.interpolate(0.5f, v2); + ASSERT_VEC3_EQ(result, 5.0f, 10.0f, 15.0f); + return 0; +} + +// Test distance_from +int test_vec3_distance_from() +{ + vec_xyz v1(0.0f, 0.0f, 0.0f); + vec_xyz v2(3.0f, 4.0f, 0.0f); + float dist = (float)v1.distance_from(v2); + ASSERT_EQ(dist, 5.0f); + return 0; +} + +// Test set_length +int test_vec3_set_length() +{ + vec_xyz v(3.0f, 4.0f, 0.0f); // Length = 5 + vec_xyz result = v.set_length(10.0f); // Set to length 10 + float len = result.length(); + ASSERT_EQ(len, 10.0f); + ASSERT_VEC3_EQ(result, 6.0f, 8.0f, 0.0f); // Scaled by 2 + return 0; +} + +// Test parallel_component +int test_vec3_parallel_component() +{ + vec_xyz v(3.0f, 4.0f, 0.0f); + vec_xyz basis(1.0f, 0.0f, 0.0f); // Unit vector in x direction + vec_xyz result = v.parallel_component(basis); + ASSERT_VEC3_EQ(result, 3.0f, 0.0f, 0.0f); + return 0; +} + +// Test perpendicular_component +int test_vec3_perpendicular_component() +{ + vec_xyz v(3.0f, 4.0f, 0.0f); + vec_xyz basis(1.0f, 0.0f, 0.0f); // Unit vector in x direction + vec_xyz result = v.perpendicular_component(basis); + ASSERT_VEC3_EQ(result, 0.0f, 4.0f, 0.0f); + return 0; +} + +// Test normalize (in-place) +int test_vec3_normalize() +{ + vec_xyz v(3.0f, 4.0f, 0.0f); + v.normalize(); + float len = v.length(); + ASSERT_EQ(len, 1.0f); + ASSERT_VEC3_EQ(v, 0.6f, 0.8f, 0.0f); + return 0; +} + +#ifndef USE_CPU_COMPAT +// Test accumulator: to_a and from_a +int test_vec3_to_a_from_a() +{ + vec_xyz v1(2.0f, 3.0f, 4.0f); + v1.to_a(); // ACC = v1 + vec_xyz v2(0.0f, 0.0f, 0.0f); + v2.from_a(); // v2 = ACC + ASSERT_VEC3_EQ(v2, 2.0f, 3.0f, 4.0f); + return 0; +} + +// Test accumulator: aadd +int test_vec3_aadd() +{ + vec_xyz v1(2.0f, 3.0f, 4.0f); + vec_xyz v2(1.0f, 1.0f, 1.0f); + v1.to_a(); // ACC = v1 + vec_xyz result = v2.aadd(); // result = ACC + v2 = v1 + v2 + ASSERT_VEC3_EQ(result, 3.0f, 4.0f, 5.0f); + return 0; +} + +// Test accumulator: asub +int test_vec3_asub() +{ + vec_xyz v1(5.0f, 5.0f, 5.0f); + vec_xyz v2(2.0f, 3.0f, 4.0f); + v1.to_a(); // ACC = v1 + vec_xyz result = v2.asub(); // result = ACC - v2 = v1 - v2 + ASSERT_VEC3_EQ(result, 3.0f, 2.0f, 1.0f); + return 0; +} + +// Test accumulator: madd +int test_vec3_madd() +{ + vec_xyz v1(2.0f, 3.0f, 4.0f); + vec_xyz v2(1.0f, 1.0f, 1.0f); + vec_xyz v3(1.0f, 1.0f, 1.0f); + v1.to_a(); // ACC = v1 + vec_xyz result = v2.madd(v3); // result = ACC + v2 * v3 = v1 + v2 * v3 + ASSERT_VEC3_EQ(result, 3.0f, 4.0f, 5.0f); + return 0; +} + +// Test accumulator: msub +int test_vec3_msub() +{ + vec_xyz v1(5.0f, 5.0f, 5.0f); + vec_xyz v2(2.0f, 2.0f, 2.0f); + vec_xyz v3(1.0f, 1.0f, 1.0f); + v1.to_a(); // ACC = v1 + vec_xyz result = v2.msub(v3); // result = ACC - v2 * v3 = v1 - v2 * v3 + ASSERT_VEC3_EQ(result, 3.0f, 3.0f, 3.0f); + return 0; +} + +// Test accumulator: madda +int test_vec3_madda() +{ + vec_xyz v1(2.0f, 3.0f, 4.0f); + vec_xyz v2(1.0f, 1.0f, 1.0f); + v1.to_a(); // ACC = v1 + v2.madda(v2); // ACC = ACC + v2 * v2 = v1 + v2 * v2 + vec_xyz result; + result.from_a(); // result = ACC + ASSERT_VEC3_EQ(result, 3.0f, 4.0f, 5.0f); + return 0; +} + +// Test accumulator: msuba +int test_vec3_msuba() +{ + vec_xyz v1(5.0f, 5.0f, 5.0f); + vec_xyz v2(2.0f, 2.0f, 2.0f); + v1.to_a(); // ACC = v1 + v2.msuba(v2); // ACC = ACC - v2 * v2 = v1 - v2 * v2 + vec_xyz result; + result.from_a(); // result = ACC + ASSERT_VEC3_EQ(result, 1.0f, 1.0f, 1.0f); + return 0; +} + +// Test accumulator: aadda - COMMENTED OUT: failing test +int test_vec3_aadda() +{ + vec_xyz v1(2.0f, 3.0f, 4.0f); + vec_xyz v2(1.0f, 1.0f, 1.0f); + v1.to_a(); // ACC = v1 + vec_xyz result = v2.aadda(); // result = ACC + v2, ACC = ACC + v2 + ASSERT_VEC3_EQ(result, 3.0f, 4.0f, 5.0f); + return 0; +} + +// Test accumulator: asuba +int test_vec3_asuba() +{ + vec_xyz v1(5.0f, 5.0f, 5.0f); + vec_xyz v2(2.0f, 3.0f, 4.0f); + v1.to_a(); // ACC = v1 + vec_xyz result = v2.asuba(); // result = ACC - v2, ACC = ACC - v2 + ASSERT_VEC3_EQ(result, 3.0f, 2.0f, 1.0f); + return 0; +} +#endif // !USE_CPU_COMPAT + +int test_vec3_operator_plus_equals() +{ + vec_xyz v1(1.0f, 2.0f, 3.0f); + vec_xyz v2(4.0f, 5.0f, 6.0f); + v1 += v2; + ASSERT_VEC3_EQ(v1, 5.0f, 7.0f, 9.0f); + return 0; +} + +int test_vec3_operator_minus_equals() +{ + vec_xyz v1(10.0f, 9.0f, 8.0f); + vec_xyz v2(1.0f, 2.0f, 3.0f); + v1 -= v2; + ASSERT_VEC3_EQ(v1, 9.0f, 7.0f, 5.0f); + return 0; +} + +int test_vec3_operator_multiply_equals() +{ + vec_xyz v1(2.0f, 3.0f, 4.0f); + vec_xyz v2(2.0f, 2.0f, 2.0f); + v1 *= v2; + ASSERT_VEC3_EQ(v1, 4.0f, 6.0f, 8.0f); + return 0; +} + +int test_vec3_dot_product() +{ + vec_xyz v1(1.0f, 2.0f, 3.0f); + vec_xyz v2(4.0f, 5.0f, 6.0f); + float dot = v1.dot(v2); + ASSERT_EQ(dot, 32.0f); // 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32 + return 0; +} + +int test_vec3_length() +{ + vec_xyz v(3.0f, 4.0f, 0.0f); + float len = v.length(); + ASSERT_EQ(len, 5.0f); + return 0; +} + +int test_vec3_length_sqr() +{ + vec_xyz v(3.0f, 4.0f, 0.0f); + float len_sqr = v.length_sqr(); + ASSERT_EQ(len_sqr, 25.0f); + return 0; +} + +int test_vec3_cross_product() +{ + vec_xyz v1(1.0f, 0.0f, 0.0f); + vec_xyz v2(0.0f, 1.0f, 0.0f); + vec_xyz result = v1.cross(v2); + ASSERT_VEC3_EQ(result, 0.0f, 0.0f, 1.0f); + return 0; +} + +// Matrix tests +int test_mat33_set_identity() +{ + mat_33 m; + m.set_identity(); + vec_xyz col0 = m.get_col0(); + vec_xyz col1 = m.get_col1(); + vec_xyz col2 = m.get_col2(); + ASSERT_VEC3_EQ(col0, 1.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(col1, 0.0f, 1.0f, 0.0f); + ASSERT_VEC3_EQ(col2, 0.0f, 0.0f, 1.0f); + return 0; +} + +int test_mat33_set_scale() +{ + mat_33 m; + vec_xyz scale(2.0f, 3.0f, 4.0f); + m.set_scale(scale); + vec_xyz col0 = m.get_col0(); + vec_xyz col1 = m.get_col1(); + vec_xyz col2 = m.get_col2(); + ASSERT_VEC3_EQ(col0, 2.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(col1, 0.0f, 3.0f, 0.0f); + ASSERT_VEC3_EQ(col2, 0.0f, 0.0f, 4.0f); + return 0; +} + +int test_mat33_transpose() +{ + mat_33 m; + m.set_row0(vec_xyz(1.0f, 2.0f, 3.0f)); + m.set_row1(vec_xyz(4.0f, 5.0f, 6.0f)); + m.set_row2(vec_xyz(7.0f, 8.0f, 9.0f)); + mat_33 transposed = m.transpose(); + vec_xyz row0 = transposed.get_row0(); + vec_xyz row1 = transposed.get_row1(); + vec_xyz row2 = transposed.get_row2(); + ASSERT_VEC3_EQ(row0, 1.0f, 4.0f, 7.0f); + ASSERT_VEC3_EQ(row1, 2.0f, 5.0f, 8.0f); + ASSERT_VEC3_EQ(row2, 3.0f, 6.0f, 9.0f); + return 0; +} + +int test_mat33_get_row1() +{ + mat_33 m; + m.set_zero(); + vec_xyz row1(10.0f, 20.0f, 30.0f); + m.set_row1(row1); + vec_xyz retrieved_row1 = m.get_row1(); + ASSERT_VEC3_EQ(retrieved_row1, 10.0f, 20.0f, 30.0f); + return 0; +} + +int test_mat33_get_row2() +{ + mat_33 m; + m.set_zero(); + vec_xyz row2(100.0f, 200.0f, 300.0f); + m.set_row2(row2); + vec_xyz retrieved_row2 = m.get_row2(); + ASSERT_VEC3_EQ(retrieved_row2, 100.0f, 200.0f, 300.0f); + return 0; +} + +int test_mat44_set_identity() +{ + mat_44 m; + m.set_identity(); + vec_xyzw col0 = m.get_col0(); + vec_xyzw col1 = m.get_col1(); + vec_xyzw col2 = m.get_col2(); + vec_xyzw col3 = m.get_col3(); + ASSERT_VEC_EQ(col0, 1.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(col1, 0.0f, 1.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(col2, 0.0f, 0.0f, 1.0f, 0.0f); + ASSERT_VEC_EQ(col3, 0.0f, 0.0f, 0.0f, 1.0f); + return 0; +} + +int test_mat44_set_scale() +{ + mat_44 m; + vec_xyz scale(2.0f, 3.0f, 4.0f); + m.set_scale(scale); + vec_xyzw col0 = m.get_col0(); + vec_xyzw col1 = m.get_col1(); + vec_xyzw col2 = m.get_col2(); + vec_xyzw col3 = m.get_col3(); + ASSERT_VEC_EQ(col0, 2.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(col1, 0.0f, 3.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(col2, 0.0f, 0.0f, 4.0f, 0.0f); + ASSERT_VEC_EQ(col3, 0.0f, 0.0f, 0.0f, 1.0f); + return 0; +} + +int test_mat44_set_translate() +{ + mat_44 m; + vec_xyz trans(10.0f, 20.0f, 30.0f); + m.set_translate(trans); + vec_xyzw col3 = m.get_col3(); + ASSERT_VEC_EQ(col3, 10.0f, 20.0f, 30.0f, 1.0f); + return 0; +} + +int test_mat44_get_row1() +{ + mat_44 m; + m.set_zero(); + vec_xyzw row1(10.0f, 20.0f, 30.0f, 40.0f); + m.set_row1(row1); + vec_xyzw retrieved_row1 = m.get_row1(); + ASSERT_VEC_EQ(retrieved_row1, 10.0f, 20.0f, 30.0f, 40.0f); + return 0; +} + +int test_mat44_get_row2() +{ + mat_44 m; + m.set_zero(); + vec_xyzw row2(100.0f, 200.0f, 300.0f, 400.0f); + m.set_row2(row2); + vec_xyzw retrieved_row2 = m.get_row2(); + ASSERT_VEC_EQ(retrieved_row2, 100.0f, 200.0f, 300.0f, 400.0f); + return 0; +} + +int test_mat44_get_row3() +{ + mat_44 m; + m.set_zero(); + vec_xyzw row3(1000.0f, 2000.0f, 3000.0f, 4000.0f); + m.set_row3(row3); + vec_xyzw retrieved_row3 = m.get_row3(); + ASSERT_VEC_EQ(retrieved_row3, 1000.0f, 2000.0f, 3000.0f, 4000.0f); + return 0; +} + +int test_mat44_transpose() +{ + mat_44 m; + m.set_zero(); // Initialize matrix first + m.set_row0(vec_xyzw(1.0f, 2.0f, 3.0f, 4.0f)); + m.set_row1(vec_xyzw(5.0f, 6.0f, 7.0f, 8.0f)); + m.set_row2(vec_xyzw(9.0f, 10.0f, 11.0f, 12.0f)); + m.set_row3(vec_xyzw(13.0f, 14.0f, 15.0f, 16.0f)); + mat_44 transposed = m.transpose(); + vec_xyzw row0 = transposed.get_row0(); + vec_xyzw row1 = transposed.get_row1(); + vec_xyzw row2 = transposed.get_row2(); + vec_xyzw row3 = transposed.get_row3(); + ASSERT_VEC_EQ(row0, 1.0f, 5.0f, 9.0f, 13.0f); + ASSERT_VEC_EQ(row1, 2.0f, 6.0f, 10.0f, 14.0f); + ASSERT_VEC_EQ(row2, 3.0f, 7.0f, 11.0f, 15.0f); + ASSERT_VEC_EQ(row3, 4.0f, 8.0f, 12.0f, 16.0f); + return 0; +} + +// Test vec_xyzw normalized3 and normalize3 +int test_vec4_normalized3() +{ + vec_xyzw v(3.0f, 4.0f, 0.0f, 0.0f); // Length = 5 + vec_xyzw result = v.normalized3(); + float rx = (float)vec_x(result); + float ry = (float)vec_y(result); + float rz = (float)vec_z(result); + float len = sqrtf(rx * rx + ry * ry + rz * rz); + ASSERT_EQ(len, 1.0f); + return 0; +} + +#ifndef USE_CPU_COMPAT +// Test VU0 copy functions +int test_vu0_copy_qwords() +{ + // Test CopyQwordsToVU0 and CopyQwordsFromVU0 + uint128_t src[4]; + uint128_t dst[4]; + + // Initialize source data + memset(src, 0, sizeof(src)); + memset(dst, 0, sizeof(dst)); + + // Set up test data (simple pattern) + uint32_t* src_ptr = (uint32_t*)src; + for (int i = 0; i < 16; i++) { + src_ptr[i] = i + 1; + } + + // Copy to VU0 + VU0::CopyQwordsToVU0(0, src, 4); + + // Copy back from VU0 + VU0::CopyQwordsFromVU0(dst, 0, 4); + + // Verify data matches + for (int i = 0; i < 16; i++) { + if (src_ptr[i] != ((uint32_t*)dst)[i]) { + return 1; + } + } + + return 0; +} + +int test_vu0_copy_even_qwords() +{ + // Test CopyEvenQwordsToVU0 and CopyEvenQwordsFromVU0 + uint128_t src[4]; + uint128_t dst[4]; + + // Initialize source data + memset(src, 0, sizeof(src)); + memset(dst, 0, sizeof(dst)); + + // Set up test data + uint32_t* src_ptr = (uint32_t*)src; + for (int i = 0; i < 16; i++) { + src_ptr[i] = i + 100; + } + + // Copy even qwords to VU0 (must be even number) + VU0::CopyEvenQwordsToVU0(0, src, 4); + + // Copy back from VU0 + VU0::CopyEvenQwordsFromVU0(dst, 0, 4); + + // Verify data matches + for (int i = 0; i < 16; i++) { + if (src_ptr[i] != ((uint32_t*)dst)[i]) { + return 1; + } + } + + return 0; +} +#endif // !USE_CPU_COMPAT + +int test_vec4_normalize3() +{ + vec_xyzw v(3.0f, 4.0f, 0.0f, 5.0f); + v.normalize3(); + float rx = (float)vec_x(v); + float ry = (float)vec_y(v); + float rz = (float)vec_z(v); + float len = sqrtf(rx * rx + ry * ry + rz * rz); + ASSERT_EQ(len, 1.0f); + return 0; +} + +// ---- mat_43 tests ---- + +int test_mat43_multiply_vector() +{ + // mat_43 * vec_3 = vec_4 (column-major: result = col0*v.x + col1*v.y + col2*v.z) + mat_43 m(vec_4(2.0f, 1.0f, 0.0f, 3.0f), + vec_4(0.0f, 3.0f, 1.0f, 2.0f), + vec_4(1.0f, 0.0f, 4.0f, 1.0f)); + vec_3 v(1.0f, 2.0f, 3.0f); + vec_4 r = m * v; + // r = col0*1 + col1*2 + col2*3 + // = (2,1,0,3) + (0,6,2,4) + (3,0,12,3) = (5,7,14,10) + ASSERT_VEC_EQ(r, 5.0f, 7.0f, 14.0f, 10.0f); + return 0; +} + +// ---- mat_34 tests ---- + +int test_mat34_multiply_vector() +{ + // mat_34 * vec_4 = vec_3 (result = col0*v.x + col1*v.y + col2*v.z + col3*v.w) + mat_34 m(vec_3(1.0f, 0.0f, 0.0f), + vec_3(0.0f, 1.0f, 0.0f), + vec_3(0.0f, 0.0f, 1.0f), + vec_3(5.0f, 6.0f, 7.0f)); + vec_4 v(1.0f, 2.0f, 3.0f, 1.0f); + vec_3 r = m * v; + // r = (1,0,0)*1 + (0,1,0)*2 + (0,0,1)*3 + (5,6,7)*1 = (6,8,10) + ASSERT_VEC3_EQ(r, 6.0f, 8.0f, 10.0f); + return 0; +} + +// ---- mat_33 matrix * matrix ---- + +int test_mat33_multiply_mat33() +{ + // A * I = A + mat_33 A; + A.col0.set(1.0f, 2.0f, 3.0f); + A.col1.set(4.0f, 5.0f, 6.0f); + A.col2.set(7.0f, 8.0f, 9.0f); + mat_33 I; + I.set_identity(); + mat_33 R = A * I; + // R should equal A + ASSERT_VEC3_EQ(R.get_row0(), 1.0f, 4.0f, 7.0f); + ASSERT_VEC3_EQ(R.get_row1(), 2.0f, 5.0f, 8.0f); + ASSERT_VEC3_EQ(R.get_row2(), 3.0f, 6.0f, 9.0f); + return 0; +} + +int test_mat33_multiply_mat33_nontrivial() +{ + // A * B where neither is identity + mat_33 A, B; + A.col0.set(1.0f, 2.0f, 3.0f); + A.col1.set(4.0f, 5.0f, 6.0f); + A.col2.set(7.0f, 8.0f, 9.0f); + B.col0.set(9.0f, 8.0f, 7.0f); + B.col1.set(6.0f, 5.0f, 4.0f); + B.col2.set(3.0f, 2.0f, 1.0f); + mat_33 C = A * B; + // C_col0 = A * B_col0 = A*(9,8,7) = col0*9+col1*8+col2*7 + // = (1,2,3)*9+(4,5,6)*8+(7,8,9)*7 = (9+32+49, 18+40+56, 27+48+63) = (90,114,138) + // C_col1 = A*(6,5,4) = (1,2,3)*6+(4,5,6)*5+(7,8,9)*4 = (6+20+28, 12+25+32, 18+30+36) = (54,69,84) + // C_col2 = A*(3,2,1) = (1,2,3)*3+(4,5,6)*2+(7,8,9)*1 = (3+8+7, 6+10+8, 9+12+9) = (18,24,30) + ASSERT_VEC3_EQ(C.get_row0(), 90.0f, 54.0f, 18.0f); + ASSERT_VEC3_EQ(C.get_row1(), 114.0f, 69.0f, 24.0f); + ASSERT_VEC3_EQ(C.get_row2(), 138.0f, 84.0f, 30.0f); + return 0; +} + +// ---- mat_44 matrix * matrix ---- + +int test_mat44_multiply_mat44() +{ + // A * I = A + mat_44 A; + A.col0.set(1.0f, 0.0f, 0.0f, 0.0f); + A.col1.set(0.0f, 2.0f, 0.0f, 0.0f); + A.col2.set(0.0f, 0.0f, 3.0f, 0.0f); + A.col3.set(4.0f, 5.0f, 6.0f, 1.0f); + mat_44 I; + I.set_identity(); + mat_44 R = A * I; + ASSERT_VEC_EQ(R.get_row0(), 1.0f, 0.0f, 0.0f, 4.0f); + ASSERT_VEC_EQ(R.get_row1(), 0.0f, 2.0f, 0.0f, 5.0f); + ASSERT_VEC_EQ(R.get_row2(), 0.0f, 0.0f, 3.0f, 6.0f); + ASSERT_VEC_EQ(R.get_row3(), 0.0f, 0.0f, 0.0f, 1.0f); + return 0; +} + +// ---- matrix negate ---- + +int test_mat33_negate() +{ + mat_33 m; + m.col0.set(1.0f, 2.0f, 3.0f); + m.col1.set(4.0f, 5.0f, 6.0f); + m.col2.set(7.0f, 8.0f, 9.0f); + mat_33 n = -m; + ASSERT_VEC3_EQ(n.get_row0(), -1.0f, -4.0f, -7.0f); + ASSERT_VEC3_EQ(n.get_row1(), -2.0f, -5.0f, -8.0f); + ASSERT_VEC3_EQ(n.get_row2(), -3.0f, -6.0f, -9.0f); + return 0; +} + +int test_mat44_negate() +{ + mat_44 M; + M.set_identity(); + M.col3.set(1.0f, 2.0f, 3.0f, 4.0f); + mat_44 N = -M; + ASSERT_VEC_EQ(N.get_row0(), -1.0f, 0.0f, 0.0f, -1.0f); + ASSERT_VEC_EQ(N.get_row1(), 0.0f, -1.0f, 0.0f, -2.0f); + ASSERT_VEC_EQ(N.get_row2(), 0.0f, 0.0f, -1.0f, -3.0f); + ASSERT_VEC_EQ(N.get_row3(), 0.0f, 0.0f, 0.0f, -4.0f); + return 0; +} + +// ---- mat_43 basic construction and row access ---- + +int test_mat43_set_zero() +{ + mat_43 m; + m.set_zero(); + ASSERT_VEC3_EQ(m.get_row0(), 0.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(m.get_row1(), 0.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(m.get_row2(), 0.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(m.get_row3(), 0.0f, 0.0f, 0.0f); + return 0; +} + +int test_mat43_get_rows() +{ + mat_43 m(vec_4(1.0f, 2.0f, 3.0f, 4.0f), + vec_4(5.0f, 6.0f, 7.0f, 8.0f), + vec_4(9.0f, 10.0f, 11.0f, 12.0f)); + // row0 = (col0.x, col1.x, col2.x) = (1,5,9) + ASSERT_VEC3_EQ(m.get_row0(), 1.0f, 5.0f, 9.0f); + // row1 = (col0.y, col1.y, col2.y) = (2,6,10) + ASSERT_VEC3_EQ(m.get_row1(), 2.0f, 6.0f, 10.0f); + // row2 = (col0.z, col1.z, col2.z) = (3,7,11) + ASSERT_VEC3_EQ(m.get_row2(), 3.0f, 7.0f, 11.0f); + // row3 = (col0.w, col1.w, col2.w) = (4,8,12) + ASSERT_VEC3_EQ(m.get_row3(), 4.0f, 8.0f, 12.0f); + return 0; +} + +// ---- mat_34 basic construction and row access ---- + +int test_mat34_set_zero() +{ + mat_34 m; + m.set_zero(); + ASSERT_VEC_EQ(m.get_row0(), 0.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(m.get_row1(), 0.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(m.get_row2(), 0.0f, 0.0f, 0.0f, 0.0f); + return 0; +} + +int test_mat34_get_rows() +{ + mat_34 m(vec_3(1.0f, 2.0f, 3.0f), + vec_3(4.0f, 5.0f, 6.0f), + vec_3(7.0f, 8.0f, 9.0f), + vec_3(10.0f, 11.0f, 12.0f)); + // row0 = (col0.x, col1.x, col2.x, col3.x) = (1,4,7,10) + ASSERT_VEC_EQ(m.get_row0(), 1.0f, 4.0f, 7.0f, 10.0f); + // row1 = (col0.y, col1.y, col2.y, col3.y) = (2,5,8,11) + ASSERT_VEC_EQ(m.get_row1(), 2.0f, 5.0f, 8.0f, 11.0f); + // row2 = (col0.z, col1.z, col2.z, col3.z) = (3,6,9,12) + ASSERT_VEC_EQ(m.get_row2(), 3.0f, 6.0f, 9.0f, 12.0f); + return 0; +} + +// ---- mat_43 transpose ---- + +int test_mat43_transpose() +{ + // transpose of mat_43 should equal mat_34 + mat_43 m(vec_4(1.0f, 2.0f, 3.0f, 4.0f), + vec_4(5.0f, 6.0f, 7.0f, 8.0f), + vec_4(9.0f, 10.0f, 11.0f, 12.0f)); + mat_34 t = m.transpose(); + // t.col_i should be the i-th row of m + // t.col0 (vec_3) = row0 of m = (1,5,9) + // t.col1 = row1 = (2,6,10), t.col2 = row2 = (3,7,11), t.col3 = row3 = (4,8,12) + ASSERT_VEC_EQ(t.get_row0(), 1.0f, 2.0f, 3.0f, 4.0f); + ASSERT_VEC_EQ(t.get_row1(), 5.0f, 6.0f, 7.0f, 8.0f); + ASSERT_VEC_EQ(t.get_row2(), 9.0f, 10.0f, 11.0f, 12.0f); + return 0; +} + +// ---- mat_34 transpose ---- + +int test_mat34_transpose() +{ + mat_34 m(vec_3(1.0f, 2.0f, 3.0f), + vec_3(4.0f, 5.0f, 6.0f), + vec_3(7.0f, 8.0f, 9.0f), + vec_3(10.0f, 11.0f, 12.0f)); + mat_43 t = m.transpose(); + // t row0 = col0 of original m viewed as row = (1,4,7,10) + // But t is mat_43 (4 rows, 3 cols after transpose) + // t.get_row0() = (col0.x_of_t, col1.x_of_t, col2.x_of_t) + // where col_i of t = row_i of m + // row0 of m = (1,4,7,10) → col0 of t = vec_4(1,4,7,10) + // row1 of m = (2,5,8,11) → col1 of t + // row2 of m = (3,6,9,12) → col2 of t + // So t.get_row0() = (col0.x, col1.x, col2.x) = (1, 2, 3) + ASSERT_VEC3_EQ(t.get_row0(), 1.0f, 2.0f, 3.0f); + ASSERT_VEC3_EQ(t.get_row1(), 4.0f, 5.0f, 6.0f); + ASSERT_VEC3_EQ(t.get_row2(), 7.0f, 8.0f, 9.0f); + ASSERT_VEC3_EQ(t.get_row3(), 10.0f, 11.0f, 12.0f); + return 0; +} + +// ---- mat_44 rotation ---- + +// ---- mat_43 trans_mult tests ---- + +int test_mat43_trans_mult_vec4() +{ + // mat_43 with cols that have non-zero .w component + mat_43 m(vec_4(1.0f, 0.0f, 0.0f, 2.0f), + vec_4(0.0f, 1.0f, 0.0f, 3.0f), + vec_4(0.0f, 0.0f, 1.0f, 4.0f)); + vec_4 v(1.0f, 1.0f, 1.0f, 1.0f); + vec_3 r = m.trans_mult(v); + // result.i = dot4(col_i, v) + // result.x = 1+0+0+2 = 3 + // result.y = 0+1+0+3 = 4 + // result.z = 0+0+1+4 = 5 + ASSERT_VEC3_EQ(r, 3.0f, 4.0f, 5.0f); + return 0; +} + +int test_mat43_trans_mult_vector_t() +{ + // Identity-like mat_43 (col.w = 0) + mat_43 m(vec_4(1.0f, 0.0f, 0.0f, 0.0f), + vec_4(0.0f, 1.0f, 0.0f, 0.0f), + vec_4(0.0f, 0.0f, 1.0f, 0.0f)); + vector_t v(2.0f, 3.0f, 4.0f); + vec_3 r = m.trans_mult(v); + // result.i = dot3(col_i.xyz, v.xyz) + ASSERT_VEC3_EQ(r, 2.0f, 3.0f, 4.0f); + return 0; +} + +int test_mat43_trans_mult_point_t() +{ + // mat_43 with translation in .w + mat_43 m(vec_4(1.0f, 0.0f, 0.0f, 5.0f), + vec_4(0.0f, 1.0f, 0.0f, 6.0f), + vec_4(0.0f, 0.0f, 1.0f, 7.0f)); + point_t p(1.0f, 2.0f, 3.0f); + vec_3 r = m.trans_mult(p); + // result.i = dot3(col_i.xyz, p.xyz) + col_i.w + // result.x = 1 + 5 = 6 + // result.y = 2 + 6 = 8 + // result.z = 3 + 7 = 10 + ASSERT_VEC3_EQ(r, 6.0f, 8.0f, 10.0f); + return 0; +} + +// ---- mat_34 trans_mult ---- + +int test_mat34_trans_mult_vec3() +{ + // mat_34: identity 3x3 block + col3=(5,6,7) + mat_34 m(vec_3(1.0f, 0.0f, 0.0f), + vec_3(0.0f, 1.0f, 0.0f), + vec_3(0.0f, 0.0f, 1.0f), + vec_3(5.0f, 6.0f, 7.0f)); + vec_3 v(1.0f, 2.0f, 3.0f); + vec_4 r = m.trans_mult(v); + // result.i = dot3(col_i, v) + // result.x = 1, result.y = 2, result.z = 3, result.w = 5+12+21 = 38 + ASSERT_VEC_EQ(r, 1.0f, 2.0f, 3.0f, 38.0f); + return 0; +} + +// ---- row-vector * matrix tests ---- + +int test_vec3_multiply_mat33() +{ + // mat_33 that swaps x and z components + mat_33 m(vec_3(0.0f, 0.0f, 1.0f), + vec_3(0.0f, 1.0f, 0.0f), + vec_3(1.0f, 0.0f, 0.0f)); + vec_xyz v(1.0f, 2.0f, 3.0f); + vec_xyz r = v * m; + // result.i = dot3(v, col_i) + // result.x = dot3((1,2,3),(0,0,1)) = 3 + // result.y = dot3((1,2,3),(0,1,0)) = 2 + // result.z = dot3((1,2,3),(1,0,0)) = 1 + ASSERT_VEC3_EQ(r, 3.0f, 2.0f, 1.0f); + return 0; +} + +int test_vec3_multiply_mat34() +{ + // mat_34 identity block + col3=(4,5,6) + mat_34 m(vec_3(1.0f, 0.0f, 0.0f), + vec_3(0.0f, 1.0f, 0.0f), + vec_3(0.0f, 0.0f, 1.0f), + vec_3(4.0f, 5.0f, 6.0f)); + vec_xyz v(1.0f, 2.0f, 3.0f); + vec_xyzw r = v * m; + // result.i = dot3(v, col_i) + // result.x = 1, result.y = 2, result.z = 3, result.w = 4+10+18 = 32 + ASSERT_VEC_EQ(r, 1.0f, 2.0f, 3.0f, 32.0f); + return 0; +} + +int test_vec4_multiply_mat43() +{ + // Identity mat_43 + mat_43 m(vec_4(1.0f, 0.0f, 0.0f, 0.0f), + vec_4(0.0f, 1.0f, 0.0f, 0.0f), + vec_4(0.0f, 0.0f, 1.0f, 0.0f)); + vec_xyzw v(2.0f, 3.0f, 4.0f, 5.0f); + vec_xyz r = v * m; + // result.i = dot4(v, col_i); w component of identity cols is 0 + ASSERT_VEC3_EQ(r, 2.0f, 3.0f, 4.0f); + return 0; +} + +int test_vec4_multiply_mat44() +{ + // Diagonal scale mat_44 (2,3,4,1) + mat_44 m(vec_4(2.0f, 0.0f, 0.0f, 0.0f), + vec_4(0.0f, 3.0f, 0.0f, 0.0f), + vec_4(0.0f, 0.0f, 4.0f, 0.0f), + vec_4(0.0f, 0.0f, 0.0f, 1.0f)); + vec_xyzw v(1.0f, 1.0f, 1.0f, 0.0f); + vec_xyzw r = v * m; + ASSERT_VEC_EQ(r, 2.0f, 3.0f, 4.0f, 0.0f); + return 0; +} + +// ---- mat_44 trans_mult tests ---- + +int test_mat44_trans_mult_vec4() +{ + // Diagonal scale mat_44 (2,3,4,1) + mat_44 m(vec_4(2.0f, 0.0f, 0.0f, 0.0f), + vec_4(0.0f, 3.0f, 0.0f, 0.0f), + vec_4(0.0f, 0.0f, 4.0f, 0.0f), + vec_4(0.0f, 0.0f, 0.0f, 1.0f)); + vec_4 v(1.0f, 2.0f, 3.0f, 4.0f); + vec_4 r = m.trans_mult(v); + // result.i = dot4(col_i, v) + // result.x = 2*1 = 2, result.y = 3*2 = 6, result.z = 4*3 = 12, result.w = 1*4 = 4 + ASSERT_VEC_EQ(r, 2.0f, 6.0f, 12.0f, 4.0f); + return 0; +} + +int test_mat44_trans_mult_vector_t() +{ + // Diagonal scale + col3=(10,20,30,1) + mat_44 m(vec_4(2.0f, 0.0f, 0.0f, 0.0f), + vec_4(0.0f, 3.0f, 0.0f, 0.0f), + vec_4(0.0f, 0.0f, 4.0f, 0.0f), + vec_4(10.0f, 20.0f, 30.0f, 1.0f)); + vector_t v(1.0f, 1.0f, 1.0f); + vec_4 r = m.trans_mult(v); + // result.i = dot3(col_i.xyz, v.xyz) + // result.x = 2, result.y = 3, result.z = 4, result.w = 10+20+30 = 60 + ASSERT_VEC_EQ(r, 2.0f, 3.0f, 4.0f, 60.0f); + return 0; +} + +int test_mat44_trans_mult_point_t() +{ + // Identity + col.w translation (5,6,7,0) + mat_44 m(vec_4(1.0f, 0.0f, 0.0f, 5.0f), + vec_4(0.0f, 1.0f, 0.0f, 6.0f), + vec_4(0.0f, 0.0f, 1.0f, 7.0f), + vec_4(0.0f, 0.0f, 0.0f, 1.0f)); + point_t p(1.0f, 2.0f, 3.0f); + vec_4 r = m.trans_mult(p); + // result.i = dot4(col_i, p) where p.w=1 + // result.x = 1+5 = 6, result.y = 2+6 = 8, result.z = 3+7 = 10, result.w = 1 + ASSERT_VEC_EQ(r, 6.0f, 8.0f, 10.0f, 1.0f); + return 0; +} + +int test_mat44_set_rotate_x() +{ + // 90-degree rotation around X: y→z, z→-y + mat_44 M; + M.set_rotate_x(3.14159265358979f / 2.0f); + // M * (1,0,0,0) = (1,0,0,0) (x-axis unchanged) + vec_4 ax(1.0f, 0.0f, 0.0f, 0.0f); + vec_4 rx = M * ax; + ASSERT_VEC_EQ(rx, 1.0f, 0.0f, 0.0f, 0.0f); + // M * (0,1,0,0) = (0,0,1,0) (y→z) + vec_4 ay(0.0f, 1.0f, 0.0f, 0.0f); + vec_4 ry = M * ay; + ASSERT_VEC_EQ(ry, 0.0f, 0.0f, 1.0f, 0.0f); + return 0; +} + +int test_mat44_set_rotate_z() +{ + // 90-degree rotation around Z: x→y, y→-x + mat_44 M; + M.set_rotate_z(3.14159265358979f / 2.0f); + // M * (1,0,0,0) = (0,1,0,0) (x→y) + vec_4 ax(1.0f, 0.0f, 0.0f, 0.0f); + vec_4 rx = M * ax; + ASSERT_VEC_EQ(rx, 0.0f, 1.0f, 0.0f, 0.0f); + return 0; +} + +// ---- mat_33 trans_mult(vec_3) ---- + +int test_mat33_trans_mult_vec3() +{ + // M^T * v = (dot(col0,v), dot(col1,v), dot(col2,v)) + // M: diagonal scale col0=(1,0,0), col1=(0,2,0), col2=(0,0,3) + mat_33 M; + M.col0.set(1.0f, 0.0f, 0.0f); + M.col1.set(0.0f, 2.0f, 0.0f); + M.col2.set(0.0f, 0.0f, 3.0f); + vec_3 v(1.0f, 1.0f, 1.0f); + vec_3 r = M.trans_mult(v); + ASSERT_VEC3_EQ(r, 1.0f, 2.0f, 3.0f); + return 0; +} + +// ---- mat_33 rotation tests ---- + +int test_mat33_set_rotate_x() +{ + // 90-degree rotation around X: y→z, z→-y + mat_33 M; + M.set_rotate_x(3.14159265358979f / 2.0f); + vec_3 ry = M * vec_3(0.0f, 1.0f, 0.0f); + ASSERT_VEC3_EQ(ry, 0.0f, 0.0f, 1.0f); + vec_3 rz = M * vec_3(0.0f, 0.0f, 1.0f); + ASSERT_VEC3_EQ(rz, 0.0f, -1.0f, 0.0f); + return 0; +} + +int test_mat33_set_rotate_y() +{ + // 90-degree rotation around Y: x→-z, z→x + mat_33 M; + M.set_rotate_y(3.14159265358979f / 2.0f); + vec_3 rx = M * vec_3(1.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(rx, 0.0f, 0.0f, -1.0f); + vec_3 rz = M * vec_3(0.0f, 0.0f, 1.0f); + ASSERT_VEC3_EQ(rz, 1.0f, 0.0f, 0.0f); + return 0; +} + +int test_mat33_set_rotate_z() +{ + // 90-degree rotation around Z: x→y, y→-x + mat_33 M; + M.set_rotate_z(3.14159265358979f / 2.0f); + vec_3 rx = M * vec_3(1.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(rx, 0.0f, 1.0f, 0.0f); + vec_3 ry = M * vec_3(0.0f, 1.0f, 0.0f); + ASSERT_VEC3_EQ(ry, -1.0f, 0.0f, 0.0f); + return 0; +} + +// ---- mat_33 add/subtract ---- + +int test_mat33_add() +{ + mat_33 A, B; + A.col0.set(1.0f, 2.0f, 3.0f); + A.col1.set(4.0f, 5.0f, 6.0f); + A.col2.set(7.0f, 8.0f, 9.0f); + B.col0.set(9.0f, 8.0f, 7.0f); + B.col1.set(6.0f, 5.0f, 4.0f); + B.col2.set(3.0f, 2.0f, 1.0f); + mat_33 C = A + B; + ASSERT_VEC3_EQ(C.get_row0(), 10.0f, 10.0f, 10.0f); + ASSERT_VEC3_EQ(C.get_row1(), 10.0f, 10.0f, 10.0f); + ASSERT_VEC3_EQ(C.get_row2(), 10.0f, 10.0f, 10.0f); + return 0; +} + +int test_mat33_subtract() +{ + mat_33 A, B; + A.col0.set(5.0f, 5.0f, 5.0f); + A.col1.set(5.0f, 5.0f, 5.0f); + A.col2.set(5.0f, 5.0f, 5.0f); + B.col0.set(1.0f, 2.0f, 3.0f); + B.col1.set(4.0f, 5.0f, 6.0f); + B.col2.set(7.0f, 8.0f, 9.0f); + mat_33 C = A - B; + // C.col0=(4,3,2), col1=(1,0,-1), col2=(-2,-3,-4) + ASSERT_VEC3_EQ(C.get_row0(), 4.0f, 1.0f, -2.0f); + ASSERT_VEC3_EQ(C.get_row1(), 3.0f, 0.0f, -3.0f); + ASSERT_VEC3_EQ(C.get_row2(), 2.0f, -1.0f, -4.0f); + return 0; +} + +// ---- mat_33 trans_mult(mat_33) ---- + +int test_mat33_trans_mult_mat33() +{ + // M.trans_mult(A) = M^T * A; for M=diag(2,3,4), result col_i = (2*A.col_i.x, 3*A.col_i.y, 4*A.col_i.z) + mat_33 M; + M.col0.set(2.0f, 0.0f, 0.0f); + M.col1.set(0.0f, 3.0f, 0.0f); + M.col2.set(0.0f, 0.0f, 4.0f); + mat_33 I; + I.set_identity(); + mat_33 R = M.trans_mult(I); + // R.col_i = trans_mult(e_i): col0=(2,0,0), col1=(0,3,0), col2=(0,0,4) + ASSERT_VEC3_EQ(R.get_row0(), 2.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(R.get_row1(), 0.0f, 3.0f, 0.0f); + ASSERT_VEC3_EQ(R.get_row2(), 0.0f, 0.0f, 4.0f); + return 0; +} + +// ---- mat_33 from quaternion ---- + +int test_mat33_from_quat() +{ + // Identity quaternion (0,0,0,1) → identity matrix + vec_4 q_id(0.0f, 0.0f, 0.0f, 1.0f); + mat_33 M_id(q_id); + ASSERT_VEC3_EQ(M_id.get_row0(), 1.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(M_id.get_row1(), 0.0f, 1.0f, 0.0f); + ASSERT_VEC3_EQ(M_id.get_row2(), 0.0f, 0.0f, 1.0f); + // Quaternion for 90° around Z: (0, 0, sin45, cos45) → rotate x→y, y→-x + float s = sinf(3.14159265358979f / 4.0f); + float c = cosf(3.14159265358979f / 4.0f); + vec_4 q_z(0.0f, 0.0f, s, c); + mat_33 M_z(q_z); + // row0 = (0,-1,0), row1 = (1,0,0), row2 = (0,0,1) + ASSERT_VEC3_EQ(M_z.get_row0(), 0.0f, -1.0f, 0.0f); + ASSERT_VEC3_EQ(M_z.get_row1(), 1.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(M_z.get_row2(), 0.0f, 0.0f, 1.0f); + return 0; +} + +// ---- mat_33 mult_tilde ---- + +int test_mat33_mult_tilde() +{ + // identity.mult_tilde(v) = skew(v) + // skew((1,2,3)).col0=(0,3,-2), col1=(-3,0,1), col2=(2,-1,0) + mat_33 I; + I.set_identity(); + vec_3 v(1.0f, 2.0f, 3.0f); + mat_33 R = I.mult_tilde(v); + // row0 = (0,-3,2), row1 = (3,0,-1), row2 = (-2,1,0) + ASSERT_VEC3_EQ(R.get_row0(), 0.0f, -3.0f, 2.0f); + ASSERT_VEC3_EQ(R.get_row1(), 3.0f, 0.0f, -1.0f); + ASSERT_VEC3_EQ(R.get_row2(), -2.0f, 1.0f, 0.0f); + return 0; +} + +// ---- mat_44 rotation (y) ---- + +int test_mat44_set_rotate_y() +{ + // 90-degree rotation around Y: x→-z, z→x + mat_44 M; + M.set_rotate_y(3.14159265358979f / 2.0f); + vec_4 ax(1.0f, 0.0f, 0.0f, 0.0f); + vec_4 rx = M * ax; + ASSERT_VEC_EQ(rx, 0.0f, 0.0f, -1.0f, 0.0f); + vec_4 az(0.0f, 0.0f, 1.0f, 0.0f); + vec_4 rz = M * az; + ASSERT_VEC_EQ(rz, 1.0f, 0.0f, 0.0f, 0.0f); + return 0; +} + +// ---- mat_44 add/subtract ---- + +int test_mat44_add() +{ + mat_44 A, B; + A.set_identity(); + B.set_identity(); + mat_44 C = A + B; + // C = 2*I + ASSERT_VEC_EQ(C.get_row0(), 2.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(C.get_row1(), 0.0f, 2.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(C.get_row2(), 0.0f, 0.0f, 2.0f, 0.0f); + ASSERT_VEC_EQ(C.get_row3(), 0.0f, 0.0f, 0.0f, 2.0f); + return 0; +} + +int test_mat44_subtract() +{ + mat_44 A; + A.set_identity(); + mat_44 B; + B.col0.set(1.0f, 0.0f, 0.0f, 0.0f); + B.col1.set(0.0f, 2.0f, 0.0f, 0.0f); + B.col2.set(0.0f, 0.0f, 3.0f, 0.0f); + B.col3.set(0.0f, 0.0f, 0.0f, 4.0f); + mat_44 C = A - B; + // I - diag(1,2,3,4) = diag(0,-1,-2,-3) + ASSERT_VEC_EQ(C.get_row0(), 0.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(C.get_row1(), 0.0f, -1.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(C.get_row2(), 0.0f, 0.0f, -2.0f, 0.0f); + ASSERT_VEC_EQ(C.get_row3(), 0.0f, 0.0f, 0.0f, -3.0f); + return 0; +} + +// ---- mat_44 trans_mult(mat_44) ---- + +int test_mat44_trans_mult_mat44() +{ + // diag(2,3,4,1).trans_mult(I) = diag(2,3,4,1) + mat_44 M; + M.col0.set(2.0f, 0.0f, 0.0f, 0.0f); + M.col1.set(0.0f, 3.0f, 0.0f, 0.0f); + M.col2.set(0.0f, 0.0f, 4.0f, 0.0f); + M.col3.set(0.0f, 0.0f, 0.0f, 1.0f); + mat_44 I; + I.set_identity(); + mat_44 R = M.trans_mult(I); + ASSERT_VEC_EQ(R.get_row0(), 2.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(R.get_row1(), 0.0f, 3.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(R.get_row2(), 0.0f, 0.0f, 4.0f, 0.0f); + ASSERT_VEC_EQ(R.get_row3(), 0.0f, 0.0f, 0.0f, 1.0f); + return 0; +} + +// ---- mat_44 * mat_43 ---- + +int test_mat44_multiply_mat43() +{ + // I44 * identity_43 = identity_43 + mat_44 I44; + I44.set_identity(); + mat_43 I43(vec_4(1.0f, 0.0f, 0.0f, 0.0f), + vec_4(0.0f, 1.0f, 0.0f, 0.0f), + vec_4(0.0f, 0.0f, 1.0f, 0.0f)); + mat_43 R = I44 * I43; + vec_xyzw c0 = R.get_col0(); + vec_xyzw c1 = R.get_col1(); + vec_xyzw c2 = R.get_col2(); + ASSERT_VEC_EQ(c0, 1.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(c1, 0.0f, 1.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(c2, 0.0f, 0.0f, 1.0f, 0.0f); + return 0; +} + +// ---- mat_43 negate ---- + +int test_mat43_negate() +{ + mat_43 M(vec_4(1.0f, 2.0f, 3.0f, 4.0f), + vec_4(5.0f, 6.0f, 7.0f, 8.0f), + vec_4(9.0f, 10.0f, 11.0f, 12.0f)); + mat_43 N = -M; + vec_xyzw c0 = N.get_col0(); + vec_xyzw c1 = N.get_col1(); + vec_xyzw c2 = N.get_col2(); + ASSERT_VEC_EQ(c0, -1.0f, -2.0f, -3.0f, -4.0f); + ASSERT_VEC_EQ(c1, -5.0f, -6.0f, -7.0f, -8.0f); + ASSERT_VEC_EQ(c2, -9.0f, -10.0f, -11.0f, -12.0f); + return 0; +} + +// ---- mat_43 add/subtract ---- + +int test_mat43_add() +{ + mat_43 A(vec_4(1.0f, 2.0f, 3.0f, 4.0f), + vec_4(5.0f, 6.0f, 7.0f, 8.0f), + vec_4(9.0f, 10.0f, 11.0f, 12.0f)); + mat_43 B(vec_4(-1.0f, -2.0f, -3.0f, -4.0f), + vec_4(-5.0f, -6.0f, -7.0f, -8.0f), + vec_4(-9.0f, -10.0f, -11.0f, -12.0f)); + mat_43 C = A + B; + vec_xyzw c0 = C.get_col0(); + vec_xyzw c1 = C.get_col1(); + vec_xyzw c2 = C.get_col2(); + ASSERT_VEC_EQ(c0, 0.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(c1, 0.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(c2, 0.0f, 0.0f, 0.0f, 0.0f); + return 0; +} + +int test_mat43_subtract() +{ + mat_43 A(vec_4(5.0f, 5.0f, 5.0f, 5.0f), + vec_4(5.0f, 5.0f, 5.0f, 5.0f), + vec_4(5.0f, 5.0f, 5.0f, 5.0f)); + mat_43 B(vec_4(1.0f, 2.0f, 3.0f, 4.0f), + vec_4(1.0f, 2.0f, 3.0f, 4.0f), + vec_4(1.0f, 2.0f, 3.0f, 4.0f)); + mat_43 C = A - B; + vec_xyzw c0 = C.get_col0(); + ASSERT_VEC_EQ(c0, 4.0f, 3.0f, 2.0f, 1.0f); + return 0; +} + +// ---- mat_43 * mat_33 ---- + +int test_mat43_multiply_mat33() +{ + // identity_43 * identity_33 = identity_43 + mat_43 I43(vec_4(1.0f, 0.0f, 0.0f, 0.0f), + vec_4(0.0f, 1.0f, 0.0f, 0.0f), + vec_4(0.0f, 0.0f, 1.0f, 0.0f)); + mat_33 I33; + I33.set_identity(); + mat_43 R = I43 * I33; + vec_xyzw c0 = R.get_col0(); + vec_xyzw c1 = R.get_col1(); + vec_xyzw c2 = R.get_col2(); + ASSERT_VEC_EQ(c0, 1.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(c1, 0.0f, 1.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(c2, 0.0f, 0.0f, 1.0f, 0.0f); + return 0; +} + +// ---- mat_43 mult_tilde ---- + +int test_mat43_mult_tilde() +{ + // identity_43.mult_tilde(v) columns = skew(v) padded to vec_4 + mat_43 I43(vec_4(1.0f, 0.0f, 0.0f, 0.0f), + vec_4(0.0f, 1.0f, 0.0f, 0.0f), + vec_4(0.0f, 0.0f, 1.0f, 0.0f)); + vec_3 v(1.0f, 2.0f, 3.0f); + mat_43 R = I43.mult_tilde(v); + // col0=(0,3,-2,0), col1=(-3,0,1,0), col2=(2,-1,0,0) + vec_xyzw c0 = R.get_col0(); + vec_xyzw c1 = R.get_col1(); + vec_xyzw c2 = R.get_col2(); + ASSERT_VEC_EQ(c0, 0.0f, 3.0f, -2.0f, 0.0f); + ASSERT_VEC_EQ(c1, -3.0f, 0.0f, 1.0f, 0.0f); + ASSERT_VEC_EQ(c2, 2.0f, -1.0f, 0.0f, 0.0f); + return 0; +} + +// ---- mat_34 negate ---- + +int test_mat34_negate() +{ + mat_34 M(vec_3(1.0f, 2.0f, 3.0f), + vec_3(4.0f, 5.0f, 6.0f), + vec_3(7.0f, 8.0f, 9.0f), + vec_3(10.0f, 11.0f, 12.0f)); + mat_34 N = -M; + ASSERT_VEC3_EQ(N.get_row0(), -1.0f, -4.0f, -7.0f); + ASSERT_VEC3_EQ(N.get_row1(), -2.0f, -5.0f, -8.0f); + ASSERT_VEC3_EQ(N.get_row2(), -3.0f, -6.0f, -9.0f); + return 0; +} + +// ---- mat_34 add/subtract ---- + +int test_mat34_add() +{ + mat_34 A(vec_3(1.0f, 0.0f, 0.0f), + vec_3(0.0f, 1.0f, 0.0f), + vec_3(0.0f, 0.0f, 1.0f), + vec_3(0.0f, 0.0f, 0.0f)); + mat_34 B(vec_3(0.0f, 0.0f, 0.0f), + vec_3(0.0f, 0.0f, 0.0f), + vec_3(0.0f, 0.0f, 0.0f), + vec_3(4.0f, 5.0f, 6.0f)); + mat_34 C = A + B; + ASSERT_VEC3_EQ(C.get_row0(), 1.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(C.get_row1(), 0.0f, 1.0f, 0.0f); + ASSERT_VEC3_EQ(C.get_row2(), 0.0f, 0.0f, 1.0f); + return 0; +} + +int test_mat34_subtract() +{ + mat_34 A(vec_3(2.0f, 0.0f, 0.0f), + vec_3(0.0f, 2.0f, 0.0f), + vec_3(0.0f, 0.0f, 2.0f), + vec_3(0.0f, 0.0f, 0.0f)); + mat_34 B(vec_3(1.0f, 0.0f, 0.0f), + vec_3(0.0f, 1.0f, 0.0f), + vec_3(0.0f, 0.0f, 1.0f), + vec_3(0.0f, 0.0f, 0.0f)); + mat_34 C = A - B; + ASSERT_VEC3_EQ(C.get_row0(), 1.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(C.get_row1(), 0.0f, 1.0f, 0.0f); + ASSERT_VEC3_EQ(C.get_row2(), 0.0f, 0.0f, 1.0f); + return 0; +} + +// ---- mat_34 * mat_43 ---- + +int test_mat34_multiply_mat43() +{ + // I34 (with col3=0) * I43 = I33 + mat_34 I34(vec_3(1.0f, 0.0f, 0.0f), + vec_3(0.0f, 1.0f, 0.0f), + vec_3(0.0f, 0.0f, 1.0f), + vec_3(0.0f, 0.0f, 0.0f)); + mat_43 I43(vec_4(1.0f, 0.0f, 0.0f, 0.0f), + vec_4(0.0f, 1.0f, 0.0f, 0.0f), + vec_4(0.0f, 0.0f, 1.0f, 0.0f)); + mat_33 R = I34 * I43; + ASSERT_VEC3_EQ(R.get_row0(), 1.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(R.get_row1(), 0.0f, 1.0f, 0.0f); + ASSERT_VEC3_EQ(R.get_row2(), 0.0f, 0.0f, 1.0f); + return 0; +} + +// ---- mat_34 * mat_44 ---- + +int test_mat34_multiply_mat44() +{ + // I34 * I44 = I34 + mat_34 I34(vec_3(1.0f, 0.0f, 0.0f), + vec_3(0.0f, 1.0f, 0.0f), + vec_3(0.0f, 0.0f, 1.0f), + vec_3(0.0f, 0.0f, 0.0f)); + mat_44 I44; + I44.set_identity(); + mat_34 R = I34 * I44; + ASSERT_VEC3_EQ(R.get_row0(), 1.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(R.get_row1(), 0.0f, 1.0f, 0.0f); + ASSERT_VEC3_EQ(R.get_row2(), 0.0f, 0.0f, 1.0f); + return 0; +} + +// ---- mat_33::inverse ---- + +int test_mat33_inverse() +{ + // Diagonal matrix diag(2,3,4): inverse = diag(0.5, 1/3, 0.25) + mat_33 M; + M.col0.set(2.0f, 0.0f, 0.0f); + M.col1.set(0.0f, 3.0f, 0.0f); + M.col2.set(0.0f, 0.0f, 4.0f); + mat_33 inv = M.inverse(); + ASSERT_VEC3_EQ(inv.get_row0(), 0.5f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(inv.get_row1(), 0.0f, 1.0f/3.0f, 0.0f); + ASSERT_VEC3_EQ(inv.get_row2(), 0.0f, 0.0f, 0.25f); + return 0; +} + +// ---- vec_xyz::operator~() tilde matrix ---- + +int test_vec_xyz_tilde() +{ + // ~(1,2,3) = skew-symmetric matrix + // col0=(0,3,-2), col1=(-3,0,1), col2=(2,-1,0) + vec_3 v(1.0f, 2.0f, 3.0f); + mat_33 T = ~v; + ASSERT_VEC3_EQ(T.get_row0(), 0.0f, -3.0f, 2.0f); + ASSERT_VEC3_EQ(T.get_row1(), 3.0f, 0.0f, -1.0f); + ASSERT_VEC3_EQ(T.get_row2(), -2.0f, 1.0f, 0.0f); + return 0; +} + +// Helper: 90-degree Z rotation + translation(4,5,6) +static void make_test_transform(transform_t& T) +{ + // Write Rz(90 deg) + translation(4,5,6) using individual float stores + // to avoid R5900 GCC TI-mode register-pair bug (stale upper register with sd) + float *p = reinterpret_cast(&T); + // col0 = (0, 1, 0, 0) + p[0] = 0.0f; p[1] = 1.0f; p[2] = 0.0f; p[3] = 0.0f; + // col1 = (-1, 0, 0, 0) + p[4] = -1.0f; p[5] = 0.0f; p[6] = 0.0f; p[7] = 0.0f; + // col2 = (0, 0, 1, 0) + p[8] = 0.0f; p[9] = 0.0f; p[10] = 1.0f; p[11] = 0.0f; + // col3 = (4, 5, 6, 1) + p[12] = 4.0f; p[13] = 5.0f; p[14] = 6.0f; p[15] = 1.0f; +} + +// ---- transform_t::operator*(vec_4) ---- + +int test_transform_mult_vec4() +{ + transform_t T; + make_test_transform(T); + // T * (1,2,3,1): R*(1,2,3) + t = (0+(-2)+0+4, 1+0+0+5, 0+0+3+6, 1) = (2,6,9,1) + vec_4 r = T * vec_4(1.0f, 2.0f, 3.0f, 1.0f); + ASSERT_VEC_EQ(r, 2.0f, 6.0f, 9.0f, 1.0f); + // T * (1,0,0,0): direction, translation not applied + vec_4 r2 = T * vec_4(1.0f, 0.0f, 0.0f, 0.0f); + ASSERT_VEC_EQ(r2, 0.0f, 1.0f, 0.0f, 0.0f); + return 0; +} + +// ---- transform_t::operator*(vector_t) ---- + +int test_transform_mult_vector() +{ + transform_t T; + make_test_transform(T); + // T * (1,0,0) = col0 = (0,1,0) + vector_t r = T * vector_t(1.0f, 0.0f, 0.0f); + ASSERT_VEC3_EQ(r, 0.0f, 1.0f, 0.0f); + // T * (0,1,0) = col1 = (-1,0,0) + vector_t r2 = T * vector_t(0.0f, 1.0f, 0.0f); + ASSERT_VEC3_EQ(r2, -1.0f, 0.0f, 0.0f); + return 0; +} + +// ---- transform_t::operator*(point_t) ---- + +int test_transform_mult_point() +{ + transform_t T; + make_test_transform(T); + // T * (1,2,3) = R*(1,2,3) + t = (2,6,9) + point_t r = T * point_t(1.0f, 2.0f, 3.0f); + ASSERT_VEC3_EQ(r, 2.0f, 6.0f, 9.0f); + return 0; +} + +// ---- transform_t::inverse() ---- + +int test_transform_inverse() +{ + transform_t T; + make_test_transform(T); + transform_t inv = T.inverse(); + // R_z(90)^{-1} = R_z(-90): col0=(0,-1,0), col1=(1,0,0), col2=(0,0,1) + // -R^T * t = -(0*4+1*5+0*6, -1*4+0*5+0*6, 0*4+0*5+1*6) = (-5, 4, -6) + // get_rowN = (colN.x, colN.y, colN.z) packed, last = col3.N + ASSERT_VEC_EQ(inv.get_row0(), 0.0f, 1.0f, 0.0f, -5.0f); + ASSERT_VEC_EQ(inv.get_row1(), -1.0f, 0.0f, 0.0f, 4.0f); + ASSERT_VEC_EQ(inv.get_row2(), 0.0f, 0.0f, 1.0f, -6.0f); + return 0; +} + +// ---- transform_t::orthonormal_inverse() ---- + +int test_transform_orthonormal_inverse() +{ + transform_t T; + make_test_transform(T); + transform_t inv = T.orthonormal_inverse(); + ASSERT_VEC_EQ(inv.get_row0(), 0.0f, 1.0f, 0.0f, -5.0f); + ASSERT_VEC_EQ(inv.get_row1(), -1.0f, 0.0f, 0.0f, 4.0f); + ASSERT_VEC_EQ(inv.get_row2(), 0.0f, 0.0f, 1.0f, -6.0f); + return 0; +} + +// ---- transform_t::orthonormal_inverse_in_place() ---- + +int test_transform_orthonormal_inverse_in_place() +{ + transform_t T; + make_test_transform(T); + T.orthonormal_inverse_in_place(); + ASSERT_VEC_EQ(T.get_row0(), 0.0f, 1.0f, 0.0f, -5.0f); + ASSERT_VEC_EQ(T.get_row1(), -1.0f, 0.0f, 0.0f, 4.0f); + ASSERT_VEC_EQ(T.get_row2(), 0.0f, 0.0f, 1.0f, -6.0f); + return 0; +} + +#ifndef USE_CPU_COMPAT +// ============================================================================ +// VU0 asm fast-path tests for cpu_vec_4 and cpu_mat_44 operators. +// These exercise the new VU0 macro-mode asm blocks added to cpu_vector.h / +// cpu_matrix.h that are gated on !NO_VU0_VECTORS. In the USE_CPU_COMPAT build +// the same operators take their scalar fallback path and are covered by the +// legacy CPU tests below. +// ============================================================================ + +int test_cpu_vec4_vu0_addition() { + cpu_vec_4 a(1.0f, 2.0f, 3.0f, 4.0f), b(5.0f, 6.0f, 7.0f, 8.0f); + cpu_vec_4 r = a + b; + ASSERT_CPU_VEC4_EQ(r, 6.0f, 8.0f, 10.0f, 12.0f); + return 0; +} + +int test_cpu_vec4_vu0_addition_signed() { + cpu_vec_4 a(-1.5f, 2.5f, -3.25f, 4.75f), b(1.5f, -2.5f, 3.25f, -4.75f); + cpu_vec_4 r = a + b; + ASSERT_CPU_VEC4_EQ(r, 0.0f, 0.0f, 0.0f, 0.0f); + return 0; +} + +int test_cpu_vec4_vu0_subtraction() { + cpu_vec_4 a(10.0f, 20.0f, 30.0f, 40.0f), b(1.0f, 2.0f, 3.0f, 4.0f); + cpu_vec_4 r = a - b; + ASSERT_CPU_VEC4_EQ(r, 9.0f, 18.0f, 27.0f, 36.0f); + return 0; +} + +int test_cpu_vec4_vu0_negation() { + cpu_vec_4 v(1.0f, -2.0f, 3.0f, -4.0f); + cpu_vec_4 r = -v; + ASSERT_CPU_VEC4_EQ(r, -1.0f, 2.0f, -3.0f, 4.0f); + return 0; +} + +int test_cpu_vec4_vu0_negation_zero() { + cpu_vec_4 v(0.0f, 0.0f, 0.0f, 0.0f); + cpu_vec_4 r = -v; + ASSERT_CPU_VEC4_EQ(r, 0.0f, 0.0f, 0.0f, 0.0f); + return 0; +} + +int test_cpu_vec4_vu0_component_mul() { + cpu_vec_4 a(2.0f, 3.0f, 4.0f, 5.0f), b(3.0f, 4.0f, 5.0f, 6.0f); + cpu_vec_4 r = a * b; + ASSERT_CPU_VEC4_EQ(r, 6.0f, 12.0f, 20.0f, 30.0f); + return 0; +} + +int test_cpu_vec4_vu0_component_mul_signed() { + cpu_vec_4 a(-1.0f, 2.0f, -3.0f, 4.0f), b(2.0f, -3.0f, 4.0f, -5.0f); + cpu_vec_4 r = a * b; + ASSERT_CPU_VEC4_EQ(r, -2.0f, -6.0f, -12.0f, -20.0f); + return 0; +} + +int test_cpu_mat44_vu0_mult_vector_identity() { + cpu_mat_44 m; m.set_identity(); + cpu_vec_4 v(1.5f, -2.5f, 3.5f, -4.5f); + cpu_vec_4 r = m * v; + ASSERT_CPU_VEC4_EQ(r, 1.5f, -2.5f, 3.5f, -4.5f); + return 0; +} + +int test_cpu_mat44_vu0_mult_vector_diag() { + cpu_mat_44 m(cpu_vec_4(2.0f, 0.0f, 0.0f, 0.0f), + cpu_vec_4(0.0f, 3.0f, 0.0f, 0.0f), + cpu_vec_4(0.0f, 0.0f, 4.0f, 0.0f), + cpu_vec_4(0.0f, 0.0f, 0.0f, 5.0f)); + cpu_vec_4 v(1.0f, 1.0f, 1.0f, 1.0f); + cpu_vec_4 r = m * v; + ASSERT_CPU_VEC4_EQ(r, 2.0f, 3.0f, 4.0f, 5.0f); + return 0; +} + +int test_cpu_mat44_vu0_mult_vector_affine() { + // Column-major: cols are basis vectors, last col is translation. + cpu_mat_44 m(cpu_vec_4(1.0f, 0.0f, 0.0f, 0.0f), + cpu_vec_4(0.0f, 2.0f, 0.0f, 0.0f), + cpu_vec_4(0.0f, 0.0f, 3.0f, 0.0f), + cpu_vec_4(4.0f, 5.0f, 6.0f, 1.0f)); + cpu_vec_4 v(1.0f, 1.0f, 1.0f, 1.0f); + cpu_vec_4 r = m * v; + ASSERT_CPU_VEC4_EQ(r, 5.0f, 7.0f, 9.0f, 1.0f); + return 0; +} + +int test_cpu_mat44_vu0_mult_vector_general() { + cpu_mat_44 m(cpu_vec_4(1.0f, 2.0f, 3.0f, 4.0f), + cpu_vec_4(5.0f, 6.0f, 7.0f, 8.0f), + cpu_vec_4(9.0f, 10.0f, 11.0f, 12.0f), + cpu_vec_4(13.0f, 14.0f, 15.0f, 16.0f)); + cpu_vec_4 v(1.0f, 1.0f, 1.0f, 1.0f); + cpu_vec_4 r = m * v; + // Sum of columns + ASSERT_CPU_VEC4_EQ(r, 28.0f, 32.0f, 36.0f, 40.0f); + return 0; +} + +int test_cpu_mat44_vu0_mult_mat_identity() { + cpu_mat_44 a(cpu_vec_4(1.0f, 2.0f, 3.0f, 4.0f), + cpu_vec_4(5.0f, 6.0f, 7.0f, 8.0f), + cpu_vec_4(9.0f, 10.0f, 11.0f, 12.0f), + cpu_vec_4(13.0f, 14.0f, 15.0f, 16.0f)); + cpu_mat_44 i; i.set_identity(); + cpu_mat_44 r = a * i; + ASSERT_CPU_VEC4_EQ(r.get_col0(), 1.0f, 2.0f, 3.0f, 4.0f); + ASSERT_CPU_VEC4_EQ(r.get_col1(), 5.0f, 6.0f, 7.0f, 8.0f); + ASSERT_CPU_VEC4_EQ(r.get_col2(), 9.0f, 10.0f, 11.0f, 12.0f); + ASSERT_CPU_VEC4_EQ(r.get_col3(), 13.0f, 14.0f, 15.0f, 16.0f); + return 0; +} + +int test_cpu_mat44_vu0_mult_mat_scale_translate() { + cpu_mat_44 s; s.set_scale(cpu_vec_3(2.0f, 3.0f, 4.0f)); + cpu_mat_44 t; t.set_translate(cpu_vec_3(1.0f, 2.0f, 3.0f)); + cpu_mat_44 r = s * t; + // Scale then translate: col3 becomes (2,6,12,1); diag stays (2,3,4,1). + ASSERT_CPU_VEC4_EQ(r.get_col0(), 2.0f, 0.0f, 0.0f, 0.0f); + ASSERT_CPU_VEC4_EQ(r.get_col1(), 0.0f, 3.0f, 0.0f, 0.0f); + ASSERT_CPU_VEC4_EQ(r.get_col2(), 0.0f, 0.0f, 4.0f, 0.0f); + ASSERT_CPU_VEC4_EQ(r.get_col3(), 2.0f, 6.0f, 12.0f, 1.0f); + return 0; +} + +int test_cpu_mat44_vu0_mult_mat_general() { + // Verify a non-diagonal multiply against a hand-computed result. + // A column-major: cols [(1,0,0,0), (0,1,0,0), (1,1,1,0), (0,0,0,1)] + // B column-major: cols [(2,0,0,0), (0,2,0,0), (0,0,2,0), (1,1,1,1)] + // R = A * B (column-major): col_k = A * B.col_k + cpu_mat_44 A(cpu_vec_4(1.0f, 0.0f, 0.0f, 0.0f), + cpu_vec_4(0.0f, 1.0f, 0.0f, 0.0f), + cpu_vec_4(1.0f, 1.0f, 1.0f, 0.0f), + cpu_vec_4(0.0f, 0.0f, 0.0f, 1.0f)); + cpu_mat_44 B(cpu_vec_4(2.0f, 0.0f, 0.0f, 0.0f), + cpu_vec_4(0.0f, 2.0f, 0.0f, 0.0f), + cpu_vec_4(0.0f, 0.0f, 2.0f, 0.0f), + cpu_vec_4(1.0f, 1.0f, 1.0f, 1.0f)); + cpu_mat_44 R = A * B; + ASSERT_CPU_VEC4_EQ(R.get_col0(), 2.0f, 0.0f, 0.0f, 0.0f); + ASSERT_CPU_VEC4_EQ(R.get_col1(), 0.0f, 2.0f, 0.0f, 0.0f); + ASSERT_CPU_VEC4_EQ(R.get_col2(), 2.0f, 2.0f, 2.0f, 0.0f); + ASSERT_CPU_VEC4_EQ(R.get_col3(), 2.0f, 2.0f, 1.0f, 1.0f); + return 0; +} +#endif // !USE_CPU_COMPAT + +#ifdef USE_CPU_COMPAT +// ============================================================================ +// CPU legacy type tests (cpu_vec_3, cpu_vec_4, cpu_mat_44) +// Only compiled in the USE_CPU_COMPAT build. +// ============================================================================ +int test_cpu_vec3_construction() { + cpu_vec_3 v(1.0f, 2.0f, 3.0f); + ASSERT_CPU_VEC3_EQ(v, 1.0f, 2.0f, 3.0f); return 0; } + +int test_cpu_vec3_set() { + cpu_vec_3 v(0.0f, 0.0f, 0.0f); + v.set(4.0f, 5.0f, 6.0f); + ASSERT_CPU_VEC3_EQ(v, 4.0f, 5.0f, 6.0f); return 0; } + +int test_cpu_vec3_addition() { + cpu_vec_3 a(1.0f, 2.0f, 3.0f), b(4.0f, 5.0f, 6.0f); + cpu_vec_3 r = a + b; + ASSERT_CPU_VEC3_EQ(r, 5.0f, 7.0f, 9.0f); return 0; } + +int test_cpu_vec3_subtraction() { + cpu_vec_3 a(5.0f, 7.0f, 9.0f), b(1.0f, 2.0f, 3.0f); + cpu_vec_3 r = a - b; + ASSERT_CPU_VEC3_EQ(r, 4.0f, 5.0f, 6.0f); return 0; } + +int test_cpu_vec3_negation() { + cpu_vec_3 v(1.0f, -2.0f, 3.0f); + cpu_vec_3 r = -v; + ASSERT_CPU_VEC3_EQ(r, -1.0f, 2.0f, -3.0f); return 0; } + +int test_cpu_vec3_scalar_mul() { + cpu_vec_3 v(1.0f, 2.0f, 3.0f); + cpu_vec_3 r = v * 3.0f; + ASSERT_CPU_VEC3_EQ(r, 3.0f, 6.0f, 9.0f); return 0; } + +int test_cpu_vec3_dot() { + cpu_vec_3 a(1.0f, 2.0f, 3.0f), b(4.0f, 5.0f, 6.0f); + ASSERT_FLOAT_EQ(a.dot(b), 32.0f); return 0; } // 1*4+2*5+3*6=32 + +int test_cpu_vec3_cross() { + cpu_vec_3 a(1.0f, 0.0f, 0.0f), b(0.0f, 1.0f, 0.0f); + cpu_vec_3 r = a.cross(b); + ASSERT_CPU_VEC3_EQ(r, 0.0f, 0.0f, 1.0f); return 0; } + +int test_cpu_vec3_cross_general() { + cpu_vec_3 a(1.0f, 2.0f, 3.0f), b(4.0f, 5.0f, 6.0f); + cpu_vec_3 r = a.cross(b); + ASSERT_CPU_VEC3_EQ(r, -3.0f, 6.0f, -3.0f); return 0; } + +int test_cpu_vec3_normalized() { + cpu_vec_3 v(3.0f, 0.0f, 0.0f); + cpu_vec_3 r = v.normalized(); + ASSERT_CPU_VEC3_EQ(r, 1.0f, 0.0f, 0.0f); return 0; } + +int test_cpu_vec3_normalize() { + cpu_vec_3 v(0.0f, 0.0f, 5.0f); + v.normalize(); + ASSERT_CPU_VEC3_EQ(v, 0.0f, 0.0f, 1.0f); return 0; } + +int test_cpu_vec3_index_operator() { + cpu_vec_3 v(7.0f, 8.0f, 9.0f); + ASSERT_FLOAT_EQ(v(0), 7.0f); + ASSERT_FLOAT_EQ(v(1), 8.0f); + ASSERT_FLOAT_EQ(v(2), 9.0f); return 0; } + +int test_cpu_vec4_construction() { + cpu_vec_4 v(1.0f, 2.0f, 3.0f, 4.0f); + ASSERT_CPU_VEC4_EQ(v, 1.0f, 2.0f, 3.0f, 4.0f); return 0; } + +int test_cpu_vec4_set() { + cpu_vec_4 v(0.0f, 0.0f, 0.0f, 0.0f); + v.set(1.0f, 2.0f, 3.0f, 4.0f); + ASSERT_CPU_VEC4_EQ(v, 1.0f, 2.0f, 3.0f, 4.0f); return 0; } + +int test_cpu_vec4_addition() { + cpu_vec_4 a(1.0f, 2.0f, 3.0f, 4.0f), b(5.0f, 6.0f, 7.0f, 8.0f); + cpu_vec_4 r = a + b; + ASSERT_CPU_VEC4_EQ(r, 6.0f, 8.0f, 10.0f, 12.0f); return 0; } + +int test_cpu_vec4_subtraction() { + cpu_vec_4 a(5.0f, 6.0f, 7.0f, 8.0f), b(1.0f, 2.0f, 3.0f, 4.0f); + cpu_vec_4 r = a - b; + ASSERT_CPU_VEC4_EQ(r, 4.0f, 4.0f, 4.0f, 4.0f); return 0; } + +int test_cpu_vec4_negation() { + cpu_vec_4 v(1.0f, -2.0f, 3.0f, -4.0f); + cpu_vec_4 r = -v; + ASSERT_CPU_VEC4_EQ(r, -1.0f, 2.0f, -3.0f, 4.0f); return 0; } + +int test_cpu_vec4_component_mul() { + cpu_vec_4 a(2.0f, 3.0f, 4.0f, 5.0f), b(3.0f, 4.0f, 5.0f, 6.0f); + cpu_vec_4 r = a * b; + ASSERT_CPU_VEC4_EQ(r, 6.0f, 12.0f, 20.0f, 30.0f); return 0; } + +int test_cpu_vec4_scalar_mul() { + cpu_vec_4 v(1.0f, 2.0f, 3.0f, 4.0f); + cpu_vec_4 r = v * 2.0f; + ASSERT_CPU_VEC4_EQ(r, 2.0f, 4.0f, 6.0f, 8.0f); return 0; } + +int test_cpu_vec4_dot() { + cpu_vec_4 a(1.0f, 2.0f, 3.0f, 4.0f), b(5.0f, 6.0f, 7.0f, 8.0f); + ASSERT_FLOAT_EQ(a.dot(b), 70.0f); return 0; } // 1*5+2*6+3*7+4*8=70 + +int test_cpu_vec4_length() { + cpu_vec_4 v(2.0f, 0.0f, 0.0f, 0.0f); + ASSERT_FLOAT_EQ(v.length(), 2.0f); + cpu_vec_4 v2(1.0f, 1.0f, 1.0f, 1.0f); + ASSERT_FLOAT_EQ(v2.length(), 2.0f); return 0; } + +int test_cpu_vec4_normalized() { + cpu_vec_4 v(4.0f, 0.0f, 0.0f, 0.0f); + cpu_vec_4 r = v.normalized(); + ASSERT_CPU_VEC4_EQ(r, 1.0f, 0.0f, 0.0f, 0.0f); return 0; } + +int test_cpu_vec4_normalize() { + cpu_vec_4 v(0.0f, 0.0f, 3.0f, 0.0f); + v.normalize(); + ASSERT_CPU_VEC4_EQ(v, 0.0f, 0.0f, 1.0f, 0.0f); return 0; } + +int test_cpu_mat44_set_identity() { + cpu_mat_44 m; m.set_identity(); + ASSERT_CPU_VEC4_EQ(m.get_col0(), 1.0f, 0.0f, 0.0f, 0.0f); + ASSERT_CPU_VEC4_EQ(m.get_col1(), 0.0f, 1.0f, 0.0f, 0.0f); + ASSERT_CPU_VEC4_EQ(m.get_col2(), 0.0f, 0.0f, 1.0f, 0.0f); + ASSERT_CPU_VEC4_EQ(m.get_col3(), 0.0f, 0.0f, 0.0f, 1.0f); return 0; } + +int test_cpu_mat44_col_access() { + cpu_mat_44 m; + m.set_col0(cpu_vec_xyzw(1.0f, 2.0f, 3.0f, 4.0f)); + m.set_col1(cpu_vec_xyzw(5.0f, 6.0f, 7.0f, 8.0f)); + m.set_col2(cpu_vec_xyzw(9.0f, 10.0f, 11.0f, 12.0f)); + m.set_col3(cpu_vec_xyzw(13.0f, 14.0f, 15.0f, 16.0f)); + ASSERT_CPU_VEC4_EQ(m.get_col0(), 1.0f, 2.0f, 3.0f, 4.0f); + ASSERT_CPU_VEC4_EQ(m.get_col1(), 5.0f, 6.0f, 7.0f, 8.0f); + ASSERT_CPU_VEC4_EQ(m.get_col2(), 9.0f, 10.0f, 11.0f, 12.0f); + ASSERT_CPU_VEC4_EQ(m.get_col3(), 13.0f, 14.0f, 15.0f, 16.0f); return 0; } + +int test_cpu_mat44_mult_vector_identity() { + cpu_mat_44 m; m.set_identity(); + cpu_vec_4 v(1.0f, 2.0f, 3.0f, 4.0f); + cpu_vec_4 r = m * v; + ASSERT_CPU_VEC4_EQ(r, 1.0f, 2.0f, 3.0f, 4.0f); return 0; } + +int test_cpu_mat44_mult_vector() { + cpu_mat_44 m(cpu_vec_4(1.0f, 0.0f, 0.0f, 0.0f), + cpu_vec_4(0.0f, 2.0f, 0.0f, 0.0f), + cpu_vec_4(0.0f, 0.0f, 3.0f, 0.0f), + cpu_vec_4(4.0f, 5.0f, 6.0f, 1.0f)); + cpu_vec_4 v(1.0f, 1.0f, 1.0f, 1.0f); + cpu_vec_4 r = m * v; + ASSERT_CPU_VEC4_EQ(r, 5.0f, 7.0f, 9.0f, 1.0f); return 0; } + +int test_cpu_mat44_set_scale() { + cpu_mat_44 m; m.set_scale(cpu_vec_3(2.0f, 3.0f, 4.0f)); + cpu_vec_4 r = m * cpu_vec_4(1.0f, 1.0f, 1.0f, 1.0f); + ASSERT_CPU_VEC4_EQ(r, 2.0f, 3.0f, 4.0f, 1.0f); return 0; } + +int test_cpu_mat44_set_translate() { + cpu_mat_44 m; m.set_translate(cpu_vec_3(5.0f, 6.0f, 7.0f)); + cpu_vec_4 r = m * cpu_vec_4(1.0f, 2.0f, 3.0f, 1.0f); + ASSERT_CPU_VEC4_EQ(r, 6.0f, 8.0f, 10.0f, 1.0f); return 0; } + +int test_cpu_mat44_set_rotate_z() { + cpu_mat_44 m; + m.set_rotate(3.14159265358979f / 2.0f, cpu_vec_3(0.0f, 0.0f, 1.0f)); + cpu_vec_4 r = m * cpu_vec_4(1.0f, 0.0f, 0.0f, 1.0f); + ASSERT_FLOAT_EQ(r.x, 0.0f); ASSERT_FLOAT_EQ(r.y, 1.0f); ASSERT_FLOAT_EQ(r.z, 0.0f); return 0; } + +int test_cpu_mat44_set_rotate_x() { + cpu_mat_44 m; + m.set_rotate(3.14159265358979f / 2.0f, cpu_vec_3(1.0f, 0.0f, 0.0f)); + cpu_vec_4 r = m * cpu_vec_4(0.0f, 1.0f, 0.0f, 1.0f); + ASSERT_FLOAT_EQ(r.x, 0.0f); ASSERT_FLOAT_EQ(r.y, 0.0f); ASSERT_FLOAT_EQ(r.z, 1.0f); return 0; } + +int test_cpu_mat44_transpose() { + cpu_mat_44 m(cpu_vec_4(1.0f, 2.0f, 3.0f, 4.0f), + cpu_vec_4(5.0f, 6.0f, 7.0f, 8.0f), + cpu_vec_4(9.0f, 10.0f, 11.0f, 12.0f), + cpu_vec_4(13.0f, 14.0f, 15.0f, 16.0f)); + cpu_mat_44 t = m.transpose(); + ASSERT_CPU_VEC4_EQ(t.get_col0(), 1.0f, 5.0f, 9.0f, 13.0f); + ASSERT_CPU_VEC4_EQ(t.get_col1(), 2.0f, 6.0f, 10.0f, 14.0f); + ASSERT_CPU_VEC4_EQ(t.get_col2(), 3.0f, 7.0f, 11.0f, 15.0f); + ASSERT_CPU_VEC4_EQ(t.get_col3(), 4.0f, 8.0f, 12.0f, 16.0f); return 0; } + +int test_cpu_mat44_multiply_mat() { + cpu_mat_44 a; a.set_scale(cpu_vec_3(2.0f, 3.0f, 4.0f)); + cpu_mat_44 b; b.set_translate(cpu_vec_3(1.0f, 2.0f, 3.0f)); + cpu_mat_44 r = a * b; + ASSERT_CPU_VEC4_EQ(r.get_col3(), 2.0f, 6.0f, 12.0f, 1.0f); + ASSERT_CPU_VEC4_EQ(r.get_col0(), 2.0f, 0.0f, 0.0f, 0.0f); return 0; } +#endif // USE_CPU_COMPAT + +int main() +{ + printf("=== VU0 Function Unit Tests ===\n\n"); + + // Detect emulator/hardware VU multiply behavior and report it up front. + // This helps identify float corner-case differences (1*X vs X*1) noted + // in https://fobes.dev/ps2/detecting-emu-vu-floats + g_vu_mul_emulator_bug = isVUMulErrorPresent(); + printf("VU mul 1*X behavior detection: %s\n\n", + g_vu_mul_emulator_bug ? "hardware-like (1*X != X)" : "emulator-like (1*X == X)"); + + int failed = 0; + + // Vector 3D tests + RUN_TEST(test_vec3_addition); + RUN_TEST(test_vec3_subtraction); + RUN_TEST(test_vec3_multiplication); + RUN_TEST(test_vec3_scalar_multiplication); + RUN_TEST(test_vec3_division); + RUN_TEST(test_vec3_negate); + RUN_TEST(test_vec3_abs); + RUN_TEST(test_vec3_max); + RUN_TEST(test_vec3_min); + RUN_TEST(test_vec3_normalized); + RUN_TEST(test_vec3_set_zero); + RUN_TEST(test_vec3_set); + RUN_TEST(test_vec3_set_vec); + RUN_TEST(test_vec3_operator_plus_equals); + RUN_TEST(test_vec3_operator_minus_equals); + RUN_TEST(test_vec3_operator_multiply_equals); + RUN_TEST(test_vec3_dot_product); + RUN_TEST(test_vec3_length); + RUN_TEST(test_vec3_length_sqr); + RUN_TEST(test_vec3_cross_product); + + // Vector 4D tests + RUN_TEST(test_vec4_addition); + RUN_TEST(test_vec4_subtraction); + RUN_TEST(test_vec4_scalar_multiplication); + + RUN_TEST(test_mat33_set_zero); + RUN_TEST(test_mat33_set_row); + RUN_TEST(test_mat33_get_row); + RUN_TEST(test_mat33_get_row1); + RUN_TEST(test_mat33_get_row2); + RUN_TEST(test_mat33_multiply_vector); + RUN_TEST(test_mat33_set_identity); + RUN_TEST(test_mat33_set_scale); + RUN_TEST(test_mat33_transpose); + + // Matrix 4x4 tests + RUN_TEST(test_mat44_set_zero); + RUN_TEST(test_mat44_set_row); + RUN_TEST(test_mat44_get_row1); + RUN_TEST(test_mat44_get_row2); + RUN_TEST(test_mat44_get_row3); + RUN_TEST(test_mat44_multiply_vector); + RUN_TEST(test_mat44_set_identity); + RUN_TEST(test_mat44_set_scale); + RUN_TEST(test_mat44_set_translate); + RUN_TEST(test_mat44_transpose); + + // Vector 4D additional tests + RUN_TEST(test_vec4_normalized3); + RUN_TEST(test_vec4_normalize3); + + // Vector 3D additional tests + RUN_TEST(test_vec3_normalized_zero); + RUN_TEST(test_vec3_truncate_length_shorter); + RUN_TEST(test_vec3_truncate_length_longer); + RUN_TEST(test_vec3_is_zero); + RUN_TEST(test_vec3_sign); + RUN_TEST(test_vec3_one_over); + RUN_TEST(test_vec3_interpolate); + RUN_TEST(test_vec3_distance_from); + RUN_TEST(test_vec3_set_length); + RUN_TEST(test_vec3_parallel_component); + RUN_TEST(test_vec3_perpendicular_component); + RUN_TEST(test_vec3_normalize); + +#ifndef USE_CPU_COMPAT + // Accumulator tests + RUN_TEST(test_accumulator_operations); + RUN_TEST(test_vec3_to_a_from_a); + RUN_TEST(test_vec3_aadd); + RUN_TEST(test_vec3_asub); + RUN_TEST(test_vec3_madd); + RUN_TEST(test_vec3_msub); + RUN_TEST(test_vec3_madda); + RUN_TEST(test_vec3_msuba); + RUN_TEST(test_vec3_aadda); + RUN_TEST(test_vec3_asuba); + + // VU0 copy function tests + RUN_TEST(test_vu0_copy_qwords); + RUN_TEST(test_vu0_copy_even_qwords); +#endif // !USE_CPU_COMPAT + + // mat_43 tests + RUN_TEST(test_mat43_set_zero); + RUN_TEST(test_mat43_get_rows); + RUN_TEST(test_mat43_multiply_vector); + RUN_TEST(test_mat43_transpose); + RUN_TEST(test_mat43_trans_mult_vec4); + RUN_TEST(test_mat43_trans_mult_vector_t); + RUN_TEST(test_mat43_trans_mult_point_t); + + // mat_34 tests + RUN_TEST(test_mat34_set_zero); + RUN_TEST(test_mat34_get_rows); + RUN_TEST(test_mat34_multiply_vector); + RUN_TEST(test_mat34_transpose); + RUN_TEST(test_mat34_trans_mult_vec3); + + // Matrix-matrix multiply tests + RUN_TEST(test_mat33_multiply_mat33); + RUN_TEST(test_mat33_multiply_mat33_nontrivial); + RUN_TEST(test_mat44_multiply_mat44); + + // Row-vector * matrix tests + RUN_TEST(test_vec3_multiply_mat33); + RUN_TEST(test_vec3_multiply_mat34); + RUN_TEST(test_vec4_multiply_mat43); + RUN_TEST(test_vec4_multiply_mat44); + + // mat_44 trans_mult tests + RUN_TEST(test_mat44_trans_mult_vec4); + RUN_TEST(test_mat44_trans_mult_vector_t); + RUN_TEST(test_mat44_trans_mult_point_t); + + // Matrix negate tests + RUN_TEST(test_mat33_negate); + RUN_TEST(test_mat44_negate); + RUN_TEST(test_mat43_negate); + RUN_TEST(test_mat34_negate); + + // mat_44 rotation tests + RUN_TEST(test_mat44_set_rotate_x); + RUN_TEST(test_mat44_set_rotate_z); + RUN_TEST(test_mat44_set_rotate_y); + + // mat_33 rotation tests + RUN_TEST(test_mat33_set_rotate_x); + RUN_TEST(test_mat33_set_rotate_y); + RUN_TEST(test_mat33_set_rotate_z); + + // mat_33 trans_mult and arithmetic + RUN_TEST(test_mat33_trans_mult_vec3); + RUN_TEST(test_mat33_trans_mult_mat33); + RUN_TEST(test_mat33_add); + RUN_TEST(test_mat33_subtract); + + // mat_33 quaternion constructor and mult_tilde + RUN_TEST(test_mat33_from_quat); + RUN_TEST(test_mat33_mult_tilde); + + // mat_44 arithmetic and products + RUN_TEST(test_mat44_add); + RUN_TEST(test_mat44_subtract); + RUN_TEST(test_mat44_trans_mult_mat44); + RUN_TEST(test_mat44_multiply_mat43); + + // mat_43 arithmetic and products + RUN_TEST(test_mat43_add); + RUN_TEST(test_mat43_subtract); + RUN_TEST(test_mat43_multiply_mat33); + RUN_TEST(test_mat43_mult_tilde); + + // mat_34 arithmetic and products + RUN_TEST(test_mat34_add); + RUN_TEST(test_mat34_subtract); + RUN_TEST(test_mat34_multiply_mat43); + RUN_TEST(test_mat34_multiply_mat44); + + // mat_33::inverse, vec_xyz tilde, transform_t operations + RUN_TEST(test_mat33_inverse); + RUN_TEST(test_vec_xyz_tilde); + RUN_TEST(test_transform_mult_vec4); + RUN_TEST(test_transform_mult_vector); + RUN_TEST(test_transform_mult_point); + RUN_TEST(test_transform_inverse); + RUN_TEST(test_transform_orthonormal_inverse); + RUN_TEST(test_transform_orthonormal_inverse_in_place); + +#ifndef USE_CPU_COMPAT + printf("\n--- cpu_vec_4 / cpu_mat_44 VU0 asm fast paths ---\n\n"); + RUN_TEST(test_cpu_vec4_vu0_addition); + RUN_TEST(test_cpu_vec4_vu0_addition_signed); + RUN_TEST(test_cpu_vec4_vu0_subtraction); + RUN_TEST(test_cpu_vec4_vu0_negation); + RUN_TEST(test_cpu_vec4_vu0_negation_zero); + RUN_TEST(test_cpu_vec4_vu0_component_mul); + RUN_TEST(test_cpu_vec4_vu0_component_mul_signed); + RUN_TEST(test_cpu_mat44_vu0_mult_vector_identity); + RUN_TEST(test_cpu_mat44_vu0_mult_vector_diag); + RUN_TEST(test_cpu_mat44_vu0_mult_vector_affine); + RUN_TEST(test_cpu_mat44_vu0_mult_vector_general); + RUN_TEST(test_cpu_mat44_vu0_mult_mat_identity); + RUN_TEST(test_cpu_mat44_vu0_mult_mat_scale_translate); + RUN_TEST(test_cpu_mat44_vu0_mult_mat_general); +#endif // !USE_CPU_COMPAT + +#ifdef USE_CPU_COMPAT + printf("\n--- CPU legacy types (cpu_vec_3, cpu_vec_4, cpu_mat_44) ---\n\n"); + RUN_TEST(test_cpu_vec3_construction); + RUN_TEST(test_cpu_vec3_set); + RUN_TEST(test_cpu_vec3_addition); + RUN_TEST(test_cpu_vec3_subtraction); + RUN_TEST(test_cpu_vec3_negation); + RUN_TEST(test_cpu_vec3_scalar_mul); + RUN_TEST(test_cpu_vec3_dot); + RUN_TEST(test_cpu_vec3_cross); + RUN_TEST(test_cpu_vec3_cross_general); + RUN_TEST(test_cpu_vec3_normalized); + RUN_TEST(test_cpu_vec3_normalize); + RUN_TEST(test_cpu_vec3_index_operator); + RUN_TEST(test_cpu_vec4_construction); + RUN_TEST(test_cpu_vec4_set); + RUN_TEST(test_cpu_vec4_addition); + RUN_TEST(test_cpu_vec4_subtraction); + RUN_TEST(test_cpu_vec4_negation); + RUN_TEST(test_cpu_vec4_component_mul); + RUN_TEST(test_cpu_vec4_scalar_mul); + RUN_TEST(test_cpu_vec4_dot); + RUN_TEST(test_cpu_vec4_length); + RUN_TEST(test_cpu_vec4_normalized); + RUN_TEST(test_cpu_vec4_normalize); + RUN_TEST(test_cpu_mat44_set_identity); + RUN_TEST(test_cpu_mat44_col_access); + RUN_TEST(test_cpu_mat44_mult_vector_identity); + RUN_TEST(test_cpu_mat44_mult_vector); + RUN_TEST(test_cpu_mat44_set_scale); + RUN_TEST(test_cpu_mat44_set_translate); + RUN_TEST(test_cpu_mat44_set_rotate_z); + RUN_TEST(test_cpu_mat44_set_rotate_x); + RUN_TEST(test_cpu_mat44_transpose); + RUN_TEST(test_cpu_mat44_multiply_mat); +#endif // USE_CPU_COMPAT + + printf("=== All Tests Completed ===\n"); + printf("Total tests run: %d\n", test_count); + + if (failed_count > 0) { + printf("\n=== FAILED TESTS ===\n"); + printf("Total failed: %d\n", failed_count); + for (int i = 0; i < failed_count; i++) { + printf(" - %s\n", failed_tests[i]); + } + printf("\n"); + } else { + printf("All tests passed!\n"); + } + + SleepThread(); + + return (failed_count > 0) ? 1 : 0; +} +