From 3f7f4c0a52894280438f095d8d2e564d2d67dfee Mon Sep 17 00:00:00 2001
From: Francisco Javier Trujillo Mata <fjtrujy@gmail.com>
Date: Wed, 5 Nov 2025 11:29:55 +0100
Subject: [PATCH] Use VU0_VECTORs and ASM code

cmake: enable VU0/ASM options and wire test execution

docs(vu0): add emulation caveats and tests README note (fobes.dev)

docs(vu0): add copilot-instructions and consolidate VU0 sources

tests: add VU mul emulator-detection helper and global flag

tmp
---
 .github/workflows/compilation.yml |    5 +-
 .gitignore                        |    2 +-
 CMAKE_BUILD.md                    |   20 +-
 CMakeLists.txt                    |   18 +-
 include/ps2s/cpu_compat_types.h   |  865 ++++++++++
 include/ps2s/cpu_matrix.h         |   65 +
 include/ps2s/cpu_vector.h         |   86 +-
 include/ps2s/matrix.h             | 1340 ++++++++-------
 include/ps2s/matrix_common.h      |  870 ++++++----
 include/ps2s/vector.h             |  273 ++--
 include/ps2s/vector_common.h      |  733 +++++----
 include/ps2s/vu.h                 |  168 +-
 tests/CMakeLists.txt              |   55 +-
 tests/README.md                   |   85 +
 tests/test_shared.cpp             | 2546 +++++++++++++++++++++++++++++
 15 files changed, 5663 insertions(+), 1468 deletions(-)
 create mode 100644 include/ps2s/cpu_compat_types.h
 create mode 100644 tests/README.md
 create mode 100644 tests/test_shared.cpp

diff --git a/.github/workflows/compilation.yml b/.github/workflows/compilation.yml
index ec5b372..a78b2b8 100644
--- a/.github/workflows/compilation.yml
+++ b/.github/workflows/compilation.yml
@@ -19,11 +19,11 @@ jobs:
         apk update
         apk add cmake build-base git make
 
-    - name: Configure with CMake
+    - name: Configure with CMake (library + tests)
       run: |
         mkdir build
         cd build
-        cmake ..
+        cmake -DBUILD_TESTS=ON ..
 
     - name: Build project with CMake
       run: |
@@ -46,3 +46,4 @@ jobs:
         name: libps2stuff-${{ steps.slug.outputs.sha8 }}
         path: |
           build/*.a
+          build/tests/*.elf
diff --git a/.gitignore b/.gitignore
index fdcc877..76f6fc6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,7 +3,6 @@ objs_*
 prebuilddone
 *.o
 *.a
-
 # CMake
 build/
 CMakeCache.txt
@@ -12,3 +11,4 @@ cmake_install.cmake
 install_manifest.txt
 *.cmake
 !CMakeLists.txt
+*.elf
diff --git a/CMAKE_BUILD.md b/CMAKE_BUILD.md
index 968bb4a..cba521c 100644
--- a/CMAKE_BUILD.md
+++ b/CMAKE_BUILD.md
@@ -30,13 +30,19 @@ make
 
 ### Building with Tests
 
-To build the test executables (when available):
+To build the test executables:
 
 ```bash
 cmake -DBUILD_TESTS=ON ..
 make
 ```
 
+If `ps2client` is installed and available in `PATH`, you can run:
+
+```bash
+make run_test_vu0
+```
+
 ## Installing
 
 To install the library and headers to `$PS2SDK/ports`:
@@ -57,16 +63,22 @@ The following CMake options are available:
 |--------|---------|-------------|
 | `DEBUG` | OFF | Enable debug build with `_DEBUG` definition |
 | `BUILD_TESTS` | OFF | Build test executables |
+| `ENABLE_VU0_VECTORS` | ON | Enable VU0 vector code paths |
+| `ENABLE_ASM` | ON | Enable assembly optimizations |
 
 ## Build Flags
 
-The CMake build automatically applies the following flags:
+The CMake build automatically applies the following warning flags:
 
-- `-DNO_VU0_VECTORS` - Disables VU0 vector code (currently broken)
-- `-DNO_ASM` - Disables assembly optimizations
 - `-Wno-strict-aliasing` - Suppresses strict aliasing warnings
 - `-Wno-conversion-null` - Suppresses conversion null warnings
 
+To disable VU0 vectors or assembly paths, configure with:
+
+```bash
+cmake -DENABLE_VU0_VECTORS=OFF -DENABLE_ASM=OFF ..
+```
+
 ## CMake Toolchain
 
 The build uses the PS2DEV CMake toolchain file located at:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2f8b1c3..4fbbd84 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,8 @@ project(ps2stuff VERSION 1.0.0 LANGUAGES CXX C)
 # Options
 option(BUILD_TESTS "Build test projects" OFF)
 option(DEBUG "Enable debug build" OFF)
+option(ENABLE_VU0_VECTORS "Enable VU0 vector code paths" ON)
+option(ENABLE_ASM "Enable assembly optimizations" ON)
 
 # Check if PS2SDK is set (should be done by toolchain file)
 if(NOT DEFINED PS2SDK)
@@ -45,11 +47,13 @@ set(WARNING_FLAGS
     -Wno-conversion-null
 )
 
-# VU0 code is broken so disable for now
-add_compile_definitions(
-    NO_VU0_VECTORS
-    NO_ASM
-)
+if(NOT ENABLE_VU0_VECTORS)
+    add_compile_definitions(NO_VU0_VECTORS)
+endif()
+
+if(NOT ENABLE_ASM)
+    add_compile_definitions(NO_ASM)
+endif()
 
 add_compile_options(${WARNING_FLAGS})
 
@@ -122,6 +126,6 @@ message(STATUS "  Debug build: ${DEBUG}")
 message(STATUS "  Build tests: ${BUILD_TESTS}")
 message(STATUS "  Output library: ${EE_LIB}")
 message(STATUS "  Install prefix: ${PS2SDK}/ports")
-message(STATUS "  VU0 vectors: DISABLED")
-message(STATUS "  ASM optimizations: DISABLED")
+message(STATUS "  VU0 vectors: ${ENABLE_VU0_VECTORS}")
+message(STATUS "  ASM optimizations: ${ENABLE_ASM}")
 message(STATUS "")
diff --git a/include/ps2s/cpu_compat_types.h b/include/ps2s/cpu_compat_types.h
new file mode 100644
index 0000000..9b7572e
--- /dev/null
+++ b/include/ps2s/cpu_compat_types.h
@@ -0,0 +1,865 @@
+/*
+ * cpu_compat_types.h - CPU-only implementations of VU0 vector/matrix types.
+ *
+ * Provides the same class names and API as vector.h/matrix.h but uses
+ * standard C++ float arithmetic.  Compile with -DUSE_CPU_COMPAT to select
+ * this path; the 12 VU0-specific tests (accumulator + DMA copy) are then
+ * excluded from the test binary at compile time.
+ */
+
+#pragma once
+
+#include <math.h>
+
+/* ========================================================================
+ * Forward declarations
+ * ======================================================================== */
+
+struct vec_xyz;
+struct vec_xyzw;
+struct mat_33;
+struct mat_34;
+struct mat_43;
+struct mat_44;
+struct transform_t;
+
+/* ========================================================================
+ * Single-component wrappers   (vec_x, vec_y, vec_z, vec_w)
+ *
+ * Used by ASSERT_VEC_EQ: (float)vec_x(v) extracts the x component and
+ * converts to float.
+ * ======================================================================== */
+
+struct vec_x {
+    float v;
+    explicit vec_x(float f) : v(f) {}
+    vec_x(const vec_xyz& t);
+    vec_x(const vec_xyzw& t);
+    operator float() const { return v; }
+};
+
+struct vec_y {
+    float v;
+    explicit vec_y(float f) : v(f) {}
+    vec_y(const vec_xyz& t);
+    vec_y(const vec_xyzw& t);
+    operator float() const { return v; }
+};
+
+struct vec_z {
+    float v;
+    explicit vec_z(float f) : v(f) {}
+    vec_z(const vec_xyz& t);
+    vec_z(const vec_xyzw& t);
+    operator float() const { return v; }
+};
+
+struct vec_w {
+    float v;
+    explicit vec_w(float f) : v(f) {}
+    vec_w(const vec_xyz& t);
+    vec_w(const vec_xyzw& t);
+    operator float() const { return v; }
+};
+
+/* ========================================================================
+ * vec_xyz  (3D float vector)
+ * ======================================================================== */
+
+struct vec_xyz {
+    float x, y, z;
+
+    vec_xyz() : x(0.0f), y(0.0f), z(0.0f) {}
+    vec_xyz(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
+    // Conversion from vec_xyzw (drops w)
+    vec_xyz(const vec_xyzw& v);
+
+    void set(float s) { x = s; y = s; z = s; }
+    void set(const vec_xyz& v) { x = v.x; y = v.y; z = v.z; }
+    void set(float x_, float y_, float z_) { x = x_; y = y_; z = z_; }
+    void set_zero() { x = 0.0f; y = 0.0f; z = 0.0f; }
+
+    vec_xyz operator+(const vec_xyz& r) const { return {x+r.x, y+r.y, z+r.z}; }
+    vec_xyz operator-(const vec_xyz& r) const { return {x-r.x, y-r.y, z-r.z}; }
+    vec_xyz operator*(const vec_xyz& r) const { return {x*r.x, y*r.y, z*r.z}; }
+    vec_xyz operator/(const vec_xyz& r) const { return {x/r.x, y/r.y, z/r.z}; }
+    vec_xyz operator*(float s)          const { return {x*s,   y*s,   z*s};   }
+    vec_xyz operator/(float s)          const { float r=1.0f/s; return {x*r, y*r, z*r}; }
+    vec_xyz operator-()                 const { return {-x, -y, -z}; }
+
+    vec_xyz& operator+=(const vec_xyz& r) { x+=r.x; y+=r.y; z+=r.z; return *this; }
+    vec_xyz& operator-=(const vec_xyz& r) { x-=r.x; y-=r.y; z-=r.z; return *this; }
+    vec_xyz& operator*=(const vec_xyz& r) { x*=r.x; y*=r.y; z*=r.z; return *this; }
+
+    vec_xyz abs()             const { return {fabsf(x), fabsf(y), fabsf(z)}; }
+    vec_xyz max(const vec_xyz& r) const {
+        return {x > r.x ? x : r.x,
+                y > r.y ? y : r.y,
+                z > r.z ? z : r.z};
+    }
+    vec_xyz min(const vec_xyz& r) const {
+        return {x < r.x ? x : r.x,
+                y < r.y ? y : r.y,
+                z < r.z ? z : r.z};
+    }
+    vec_xyz sign() const {
+        return {x > 0.0f ? 1.0f : (x < 0.0f ? -1.0f : 0.0f),
+                y > 0.0f ? 1.0f : (y < 0.0f ? -1.0f : 0.0f),
+                z > 0.0f ? 1.0f : (z < 0.0f ? -1.0f : 0.0f)};
+    }
+    vec_xyz one_over() const { return {1.0f/x, 1.0f/y, 1.0f/z}; }
+
+    float dot(const vec_xyz& r) const { return x*r.x + y*r.y + z*r.z; }
+    float length_sqr()           const { return x*x + y*y + z*z; }
+    float length()               const { return sqrtf(length_sqr()); }
+
+    vec_xyz normalized() const {
+        float len = length();
+        if (len == 0.0f) return {0.0f, 0.0f, 0.0f};
+        float inv = 1.0f / len;
+        return {x*inv, y*inv, z*inv};
+    }
+    vec_xyz& normalize() { *this = normalized(); return *this; }
+
+    vec_xyz cross(const vec_xyz& r) const {
+        return {y*r.z - z*r.y,
+                z*r.x - x*r.z,
+                x*r.y - y*r.x};
+    }
+
+    int is_zero() const { return (x == 0.0f && y == 0.0f && z == 0.0f) ? 1 : 0; }
+
+    vec_xyz set_length(float len) const { return normalized() * len; }
+
+    vec_xyz truncate_length(float maxLen) const {
+        float len = length();
+        if (len <= maxLen || len == 0.0f) return *this;
+        float scale = maxLen / len;
+        return {x*scale, y*scale, z*scale};
+    }
+
+    vec_xyz interpolate(float alpha, const vec_xyz& other) const {
+        return {x + alpha*(other.x - x),
+                y + alpha*(other.y - y),
+                z + alpha*(other.z - z)};
+    }
+
+    float distance_from(const vec_xyz& r) const { return (*this - r).length(); }
+
+    // Parallel component of *this along unit vector basis
+    vec_xyz parallel_component(const vec_xyz& unitBasis) const {
+        return unitBasis * dot(unitBasis);
+    }
+    // Perpendicular component of *this to unit vector basis
+    vec_xyz perpendicular_component(const vec_xyz& unitBasis) const {
+        return *this - parallel_component(unitBasis);
+    }
+
+    // Index access
+    float& operator[](int i)       { return (&x)[i]; }
+    float  operator[](int i) const { return (&x)[i]; }
+};
+
+// Scalar-first multiply
+inline vec_xyz operator*(float s, const vec_xyz& v) { return v * s; }
+
+typedef vec_xyz vec_3;
+
+/* ========================================================================
+ * vec_xyzw  (4D float vector)
+ * ======================================================================== */
+
+struct vec_xyzw {
+    float x, y, z, w;
+
+    vec_xyzw() : x(0.0f), y(0.0f), z(0.0f), w(0.0f) {}
+    vec_xyzw(float x_, float y_, float z_, float w_) : x(x_), y(y_), z(z_), w(w_) {}
+    // 3-arg ctor sets w=0
+    vec_xyzw(float x_, float y_, float z_) : x(x_), y(y_), z(z_), w(0.0f) {}
+    // Construct from vec_xyz (w=0)
+    vec_xyzw(const vec_xyz& v) : x(v.x), y(v.y), z(v.z), w(0.0f) {}
+
+    void set(float x_, float y_, float z_, float w_) { x=x_; y=y_; z=z_; w=w_; }
+    void set(float x_, float y_, float z_) { x=x_; y=y_; z=z_; }
+    void set_zero() { x=0.0f; y=0.0f; z=0.0f; w=0.0f; }
+
+    vec_xyzw operator+(const vec_xyzw& r) const { return {x+r.x, y+r.y, z+r.z, w+r.w}; }
+    vec_xyzw operator-(const vec_xyzw& r) const { return {x-r.x, y-r.y, z-r.z, w-r.w}; }
+    vec_xyzw operator*(const vec_xyzw& r) const { return {x*r.x, y*r.y, z*r.z, w*r.w}; }
+    vec_xyzw operator*(float s)           const { return {x*s, y*s, z*s, w*s}; }
+    vec_xyzw operator-()                  const { return {-x, -y, -z, -w}; }
+
+    vec_xyzw& operator+=(const vec_xyzw& r) { x+=r.x; y+=r.y; z+=r.z; w+=r.w; return *this; }
+    vec_xyzw& operator-=(const vec_xyzw& r) { x-=r.x; y-=r.y; z-=r.z; w-=r.w; return *this; }
+
+    float dot(const vec_xyzw& r) const { return x*r.x + y*r.y + z*r.z + w*r.w; }
+    float dot3(const vec_xyz& r)  const { return x*r.x + y*r.y + z*r.z; }
+
+    vec_xyzw normalized3() const {
+        float len = sqrtf(x*x + y*y + z*z);
+        if (len == 0.0f) return *this;
+        float inv = 1.0f / len;
+        return {x*inv, y*inv, z*inv, w};
+    }
+    vec_xyzw& normalize3() { *this = normalized3(); return *this; }
+
+    // Index access
+    float& operator[](int i)       { return (&x)[i]; }
+    float  operator[](int i) const { return (&x)[i]; }
+};
+
+inline vec_xyzw operator*(float s, const vec_xyzw& v) { return v * s; }
+
+typedef vec_xyzw vec_4;
+
+/* ========================================================================
+ * vector_t  (direction: treat w as 0)
+ * point_t   (position:  treat w as 1)
+ * Both stored as vec_xyz; w is implicit.
+ * ======================================================================== */
+
+struct vector_t : public vec_xyz {
+    vector_t() : vec_xyz() {}
+    vector_t(float x_, float y_, float z_) : vec_xyz(x_, y_, z_) {}
+    vector_t(const vec_xyz& v) : vec_xyz(v) {}
+};
+
+struct point_t : public vec_xyz {
+    point_t() : vec_xyz() {}
+    point_t(float x_, float y_, float z_) : vec_xyz(x_, y_, z_) {}
+    point_t(const vec_xyz& v) : vec_xyz(v) {}
+};
+
+/* ========================================================================
+ * Deferred single-component conversions (need full vec_xyz/vec_xyzw)
+ * ======================================================================== */
+
+inline vec_x::vec_x(const vec_xyz&  t) : v(t.x) {}
+inline vec_x::vec_x(const vec_xyzw& t) : v(t.x) {}
+inline vec_y::vec_y(const vec_xyz&  t) : v(t.y) {}
+inline vec_y::vec_y(const vec_xyzw& t) : v(t.y) {}
+inline vec_z::vec_z(const vec_xyz&  t) : v(t.z) {}
+inline vec_z::vec_z(const vec_xyzw& t) : v(t.z) {}
+inline vec_w::vec_w(const vec_xyz&  /*t*/) : v(0.0f) {}
+inline vec_w::vec_w(const vec_xyzw& t)     : v(t.w) {}
+
+inline vec_xyz::vec_xyz(const vec_xyzw& v) : x(v.x), y(v.y), z(v.z) {}
+
+/* ========================================================================
+ * mat_33  (3×3 column-major matrix)
+ *
+ * Storage: col0, col1, col2  (each vec_xyz = one column)
+ * Matrix M acts on column vector v as:  M*v = col0*v.x + col1*v.y + col2*v.z
+ * ======================================================================== */
+
+struct mat_33 {
+    vec_xyz col0, col1, col2;
+
+    // Default constructor – initialises to identity
+    mat_33() { set_identity(); }
+
+    // Construct from 3 column vectors
+    mat_33(const vec_xyz& c0, const vec_xyz& c1, const vec_xyz& c2)
+        : col0(c0), col1(c1), col2(c2) {}
+
+    // Construct from unit quaternion (x,y,z,w)
+    explicit mat_33(const vec_xyzw& q) {
+        float qx = q.x, qy = q.y, qz = q.z, qw = q.w;
+        col0 = { 1.0f-2.0f*(qy*qy+qz*qz),  2.0f*(qx*qy+qz*qw),  2.0f*(qx*qz-qy*qw) };
+        col1 = { 2.0f*(qx*qy-qz*qw),        1.0f-2.0f*(qx*qx+qz*qz),  2.0f*(qy*qz+qx*qw) };
+        col2 = { 2.0f*(qx*qz+qy*qw),        2.0f*(qy*qz-qx*qw),  1.0f-2.0f*(qx*qx+qy*qy) };
+    }
+
+    void set_zero()     { col0.set_zero(); col1.set_zero(); col2.set_zero(); }
+    void set_identity() { col0={1,0,0}; col1={0,1,0}; col2={0,0,1}; }
+    void set_scale(const vec_xyz& s) {
+        col0={s.x,0,0}; col1={0,s.y,0}; col2={0,0,s.z};
+    }
+
+    // Row access (each row is the i-th elements of all columns)
+    void set_row0(const vec_xyz& r) { col0.x=r.x; col1.x=r.y; col2.x=r.z; }
+    void set_row1(const vec_xyz& r) { col0.y=r.x; col1.y=r.y; col2.y=r.z; }
+    void set_row2(const vec_xyz& r) { col0.z=r.x; col1.z=r.y; col2.z=r.z; }
+
+    vec_xyz get_row0() const { return {col0.x, col1.x, col2.x}; }
+    vec_xyz get_row1() const { return {col0.y, col1.y, col2.y}; }
+    vec_xyz get_row2() const { return {col0.z, col1.z, col2.z}; }
+
+    vec_xyz get_col0() const { return col0; }
+    vec_xyz get_col1() const { return col1; }
+    vec_xyz get_col2() const { return col2; }
+
+    // Matrix-vector product: M * v
+    vec_xyz operator*(const vec_xyz& v) const {
+        return col0*v.x + col1*v.y + col2*v.z;
+    }
+
+    // Transpose-multiply: M^T * v = (dot(col0,v), dot(col1,v), dot(col2,v))
+    vec_xyz trans_mult(const vec_xyz& v) const {
+        return {col0.dot(v), col1.dot(v), col2.dot(v)};
+    }
+
+    mat_33 transpose() const {
+        mat_33 r;
+        r.col0 = get_row0(); r.col1 = get_row1(); r.col2 = get_row2();
+        return r;
+    }
+
+    mat_33 operator-() const {
+        mat_33 r; r.col0=-col0; r.col1=-col1; r.col2=-col2; return r;
+    }
+
+    mat_33 operator+(const mat_33& m) const {
+        mat_33 r; r.col0=col0+m.col0; r.col1=col1+m.col1; r.col2=col2+m.col2; return r;
+    }
+    mat_33 operator-(const mat_33& m) const {
+        mat_33 r; r.col0=col0-m.col0; r.col1=col1-m.col1; r.col2=col2-m.col2; return r;
+    }
+
+    // Matrix product: (A*B).col_i = A * B.col_i
+    mat_33 operator*(const mat_33& m) const {
+        mat_33 r;
+        r.col0 = *this * m.col0;
+        r.col1 = *this * m.col1;
+        r.col2 = *this * m.col2;
+        return r;
+    }
+
+    // Transpose-multiply: M^T * A
+    mat_33 trans_mult(const mat_33& m) const {
+        // (M^T * A).col_i = M^T * A.col_i = (dot(col0,A.col_i), dot(col1,A.col_i), dot(col2,A.col_i))
+        mat_33 r;
+        r.col0 = trans_mult(m.col0);
+        r.col1 = trans_mult(m.col1);
+        r.col2 = trans_mult(m.col2);
+        return r;
+    }
+
+    void set_rotate_x(float a) {
+        float c=cosf(a), s=sinf(a);
+        col0={1,0,0}; col1={0,c,s}; col2={0,-s,c};
+    }
+    void set_rotate_y(float a) {
+        float c=cosf(a), s=sinf(a);
+        col0={c,0,-s}; col1={0,1,0}; col2={s,0,c};
+    }
+    void set_rotate_z(float a) {
+        float c=cosf(a), s=sinf(a);
+        col0={c,s,0}; col1={-s,c,0}; col2={0,0,1};
+    }
+
+    // mult_tilde: M * [v]× where [v]× is the skew-symmetric matrix
+    // [v]×: col0=(0,vz,-vy), col1=(-vz,0,vx), col2=(vy,-vx,0)
+    mat_33 mult_tilde(const vec_xyz& v) const {
+        mat_33 r;
+        r.col0 = *this * vec_xyz( 0.0f,  v.z, -v.y);
+        r.col1 = *this * vec_xyz(-v.z,  0.0f,  v.x);
+        r.col2 = *this * vec_xyz( v.y, -v.x,  0.0f);
+        return r;
+    }
+
+    // 3×3 inverse via cofactors (Cramer's rule).
+    // Column-major names: a=col0.x=M[0][0], d=col0.y=M[1][0], g=col0.z=M[2][0]
+    //                     b=col1.x=M[0][1], e=col1.y=M[1][1], h=col1.z=M[2][1]
+    //                     c=col2.x=M[0][2], f=col2.y=M[1][2], i=col2.z=M[2][2]
+    // inv.col_k = (C[0][k], C[1][k], C[2][k]) / det  (adjugate row k)
+    mat_33 inverse() const {
+        float a=col0.x, d=col0.y, g=col0.z;
+        float b=col1.x, e=col1.y, h=col1.z;
+        float c=col2.x, f=col2.y, ii=col2.z;
+        float det = a*(e*ii - f*h) - b*(d*ii - f*g) + c*(d*h - e*g);
+        float inv_det = 1.0f / det;
+        mat_33 r;
+        r.col0.x = ( e*ii - f*h)  * inv_det;
+        r.col0.y = -(d*ii - f*g)  * inv_det;
+        r.col0.z = ( d*h  - e*g)  * inv_det;
+        r.col1.x = -(b*ii - c*h)  * inv_det;
+        r.col1.y = ( a*ii - c*g)  * inv_det;
+        r.col1.z = -(a*h  - b*g)  * inv_det;
+        r.col2.x = ( b*f  - c*e)  * inv_det;
+        r.col2.y = -(a*f  - c*d)  * inv_det;
+        r.col2.z = ( a*e  - b*d)  * inv_det;
+        return r;
+    }
+};
+
+// Skew-symmetric matrix operator: ~v creates [v]×
+inline mat_33 operator~(const vec_xyz& v) {
+    mat_33 r;
+    r.col0 = { 0.0f,  v.z, -v.y};
+    r.col1 = {-v.z,  0.0f,  v.x};
+    r.col2 = { v.y,  -v.x, 0.0f};
+    return r;
+}
+
+// Row-vector * matrix: v^T * M = M^T * v
+inline vec_xyz operator*(const vec_xyz& v, const mat_33& m) {
+    return {v.dot(m.col0), v.dot(m.col1), v.dot(m.col2)};
+}
+
+/* ========================================================================
+ * mat_43  (4 rows × 3 columns, column-major)
+ *
+ * Storage: col0, col1, col2  (each vec_xyzw = column with 4 floats)
+ * Product: mat_43 * vec_3 = vec_4  (result = col0*v.x + col1*v.y + col2*v.z)
+ * ======================================================================== */
+
+struct mat_43 {
+    vec_xyzw col0, col1, col2;
+
+    mat_43() { set_zero(); }
+    mat_43(const vec_xyzw& c0, const vec_xyzw& c1, const vec_xyzw& c2)
+        : col0(c0), col1(c1), col2(c2) {}
+
+    void set_zero() { col0.set_zero(); col1.set_zero(); col2.set_zero(); }
+
+    vec_xyz get_row0() const { return {col0.x, col1.x, col2.x}; }
+    vec_xyz get_row1() const { return {col0.y, col1.y, col2.y}; }
+    vec_xyz get_row2() const { return {col0.z, col1.z, col2.z}; }
+    vec_xyz get_row3() const { return {col0.w, col1.w, col2.w}; }
+
+    vec_xyzw get_col0() const { return col0; }
+    vec_xyzw get_col1() const { return col1; }
+    vec_xyzw get_col2() const { return col2; }
+
+    // mat_43 * vec_3 = vec_4
+    vec_xyzw operator*(const vec_xyz& v) const {
+        vec_xyzw r;
+        r.x = col0.x*v.x + col1.x*v.y + col2.x*v.z;
+        r.y = col0.y*v.x + col1.y*v.y + col2.y*v.z;
+        r.z = col0.z*v.x + col1.z*v.y + col2.z*v.z;
+        r.w = col0.w*v.x + col1.w*v.y + col2.w*v.z;
+        return r;
+    }
+
+    // Transpose-multiply: (mat_43)^T * vec_4 = vec_3
+    // (mat_43^T).row_i = col_i, so result_i = dot(col_i, v)
+    vec_xyz trans_mult(const vec_xyzw& v) const {
+        return {col0.dot(v), col1.dot(v), col2.dot(v)};
+    }
+    vec_xyz trans_mult(const vector_t& v) const {
+        vec_xyzw v4(v.x, v.y, v.z, 0.0f);
+        return trans_mult(v4);
+    }
+    vec_xyz trans_mult(const point_t& v) const {
+        vec_xyzw v4(v.x, v.y, v.z, 1.0f);
+        return trans_mult(v4);
+    }
+
+    mat_34 transpose() const; // defined after mat_34 is complete
+
+    mat_43 operator-() const {
+        return {-col0, -col1, -col2};
+    }
+    mat_43 operator+(const mat_43& m) const {
+        return {col0+m.col0, col1+m.col1, col2+m.col2};
+    }
+    mat_43 operator-(const mat_43& m) const {
+        return {col0-m.col0, col1-m.col1, col2-m.col2};
+    }
+
+    // mat_43 * mat_33 = mat_43
+    // result.col_i = (mat_43) * (mat_33).col_i
+    mat_43 operator*(const mat_33& m) const {
+        mat_43 r;
+        r.col0 = *this * m.col0;
+        r.col1 = *this * m.col1;
+        r.col2 = *this * m.col2;
+        return r;
+    }
+
+    // mult_tilde: (mat_43) * [v]× where [v]× cols are vec_3
+    // [v]×: col0=(0,vz,-vy), col1=(-vz,0,vx), col2=(vy,-vx,0)
+    mat_43 mult_tilde(const vec_xyz& v) const {
+        mat_43 r;
+        r.col0 = *this * vec_xyz( 0.0f,  v.z, -v.y);
+        r.col1 = *this * vec_xyz(-v.z,  0.0f,  v.x);
+        r.col2 = *this * vec_xyz( v.y, -v.x,  0.0f);
+        return r;
+    }
+};
+
+/* ========================================================================
+ * mat_34  (3 rows × 4 columns, column-major)
+ *
+ * Storage: col0, col1, col2, col3  (each vec_xyz = column with 3 floats)
+ * Product: mat_34 * vec_4 = vec_3  (result = col0*v.x + col1*v.y + col2*v.z + col3*v.w)
+ * ======================================================================== */
+
+struct mat_34 {
+    vec_xyz col0, col1, col2, col3;
+
+    mat_34() { set_zero(); }
+    mat_34(const vec_xyz& c0, const vec_xyz& c1, const vec_xyz& c2, const vec_xyz& c3)
+        : col0(c0), col1(c1), col2(c2), col3(c3) {}
+
+    void set_zero() { col0.set_zero(); col1.set_zero(); col2.set_zero(); col3.set_zero(); }
+
+    vec_xyzw get_row0() const { return {col0.x, col1.x, col2.x, col3.x}; }
+    vec_xyzw get_row1() const { return {col0.y, col1.y, col2.y, col3.y}; }
+    vec_xyzw get_row2() const { return {col0.z, col1.z, col2.z, col3.z}; }
+
+    // mat_34 * vec_4 = vec_3
+    vec_xyz operator*(const vec_xyzw& v) const {
+        return col0*v.x + col1*v.y + col2*v.z + col3*v.w;
+    }
+    vec_xyz operator*(const vector_t& v) const {
+        return col0*v.x + col1*v.y + col2*v.z;
+    }
+    vec_xyz operator*(const point_t& v) const {
+        return col0*v.x + col1*v.y + col2*v.z + col3;
+    }
+
+    // Transpose-multiply: (mat_34)^T * vec_3 = vec_4
+    // (mat_34^T).row_i = col_i, so result_i = dot(col_i, v)
+    vec_xyzw trans_mult(const vec_xyz& v) const {
+        return {col0.dot(v), col1.dot(v), col2.dot(v), col3.dot(v)};
+    }
+
+    mat_43 transpose() const; // defined after mat_43 is complete
+
+    mat_34 operator-() const {
+        return {-col0, -col1, -col2, -col3};
+    }
+    mat_34 operator+(const mat_34& m) const {
+        return {col0+m.col0, col1+m.col1, col2+m.col2, col3+m.col3};
+    }
+    mat_34 operator-(const mat_34& m) const {
+        return {col0-m.col0, col1-m.col1, col2-m.col2, col3-m.col3};
+    }
+
+    // mat_34 * mat_43 = mat_33
+    // result.col_i = (mat_34) * (mat_43).col_i   [mat_34 * vec_4]
+    mat_33 operator*(const mat_43& m) const {
+        mat_33 r;
+        r.col0 = *this * m.col0;
+        r.col1 = *this * m.col1;
+        r.col2 = *this * m.col2;
+        return r;
+    }
+
+    // mat_34 * mat_44 = mat_34
+    mat_34 operator*(const mat_44& m) const; // defined after mat_44
+};
+
+/* ========================================================================
+ * mat_43::transpose() returns mat_34 – defined here after mat_34 is declared
+ * ======================================================================== */
+
+inline mat_34 mat_43_to_mat_34(const mat_43& m) {
+    mat_34 r;
+    // Row i of mat_43 = (col0[i], col1[i], col2[i])
+    // Col j of mat_34 (= row j of mat_43):
+    r.col0 = { m.col0.x, m.col1.x, m.col2.x };  // row0 as vec_3
+    r.col1 = { m.col0.y, m.col1.y, m.col2.y };  // row1 as vec_3
+    r.col2 = { m.col0.z, m.col1.z, m.col2.z };  // row2 as vec_3
+    r.col3 = { m.col0.w, m.col1.w, m.col2.w };  // row3 as vec_3
+    return r;
+}
+
+inline mat_43 mat_34_to_mat_43(const mat_34& m) {
+    mat_43 r;
+    // Transpose of mat_34 (3x4) = mat_43 (4x3)
+    // col0 of result = row0 of mat_34 = (col0.x, col1.x, col2.x, col3.x)
+    r.col0 = { m.col0.x, m.col1.x, m.col2.x, m.col3.x };
+    r.col1 = { m.col0.y, m.col1.y, m.col2.y, m.col3.y };
+    r.col2 = { m.col0.z, m.col1.z, m.col2.z, m.col3.z };
+    return r;
+}
+
+// Row-vector * mat_43: vec_4^T * mat_43 = vec_3
+inline vec_xyz operator*(const vec_xyzw& v, const mat_43& m) {
+    return {v.dot(m.col0), v.dot(m.col1), v.dot(m.col2)};
+}
+
+// Row-vector * mat_34: vec_3^T * mat_34 = vec_4
+inline vec_xyzw operator*(const vec_xyz& v, const mat_34& m) {
+    return {v.dot(m.col0), v.dot(m.col1), v.dot(m.col2), v.dot(m.col3)};
+}
+
+/* ========================================================================
+ * mat_44  (4×4 column-major matrix)
+ * ======================================================================== */
+
+struct mat_44 {
+    vec_xyzw col0, col1, col2, col3;
+
+    mat_44() { set_identity(); }
+
+    // Construct from 4 column vectors
+    mat_44(const vec_xyzw& c0, const vec_xyzw& c1, const vec_xyzw& c2, const vec_xyzw& c3)
+        : col0(c0), col1(c1), col2(c2), col3(c3) {}
+
+    void set_zero()     { col0.set_zero(); col1.set_zero(); col2.set_zero(); col3.set_zero(); }
+    void set_identity() {
+        col0={1,0,0,0}; col1={0,1,0,0}; col2={0,0,1,0}; col3={0,0,0,1};
+    }
+    void set_scale(const vec_xyz& s) {
+        col0={s.x,0,0,0}; col1={0,s.y,0,0}; col2={0,0,s.z,0}; col3={0,0,0,1};
+    }
+    void set_translate(const vec_xyz& t) {
+        col0={1,0,0,0}; col1={0,1,0,0}; col2={0,0,1,0}; col3={t.x,t.y,t.z,1};
+    }
+
+    void set_row0(const vec_xyzw& r) { col0.x=r.x; col1.x=r.y; col2.x=r.z; col3.x=r.w; }
+    void set_row1(const vec_xyzw& r) { col0.y=r.x; col1.y=r.y; col2.y=r.z; col3.y=r.w; }
+    void set_row2(const vec_xyzw& r) { col0.z=r.x; col1.z=r.y; col2.z=r.z; col3.z=r.w; }
+    void set_row3(const vec_xyzw& r) { col0.w=r.x; col1.w=r.y; col2.w=r.z; col3.w=r.w; }
+
+    vec_xyzw get_row0() const { return {col0.x, col1.x, col2.x, col3.x}; }
+    vec_xyzw get_row1() const { return {col0.y, col1.y, col2.y, col3.y}; }
+    vec_xyzw get_row2() const { return {col0.z, col1.z, col2.z, col3.z}; }
+    vec_xyzw get_row3() const { return {col0.w, col1.w, col2.w, col3.w}; }
+
+    vec_xyzw get_col0() const { return col0; }
+    vec_xyzw get_col1() const { return col1; }
+    vec_xyzw get_col2() const { return col2; }
+    vec_xyzw get_col3() const { return col3; }
+
+    // M * v
+    vec_xyzw operator*(const vec_xyzw& v) const {
+        vec_xyzw r;
+        r.x = col0.x*v.x + col1.x*v.y + col2.x*v.z + col3.x*v.w;
+        r.y = col0.y*v.x + col1.y*v.y + col2.y*v.z + col3.y*v.w;
+        r.z = col0.z*v.x + col1.z*v.y + col2.z*v.z + col3.z*v.w;
+        r.w = col0.w*v.x + col1.w*v.y + col2.w*v.z + col3.w*v.w;
+        return r;
+    }
+
+    // Transpose-multiply: M^T * v
+    vec_xyzw trans_mult(const vec_xyzw& v) const {
+        return {col0.dot(v), col1.dot(v), col2.dot(v), col3.dot(v)};
+    }
+    vec_xyzw trans_mult(const vector_t& v) const {
+        vec_xyzw v4(v.x, v.y, v.z, 0.0f);
+        return trans_mult(v4);
+    }
+    vec_xyzw trans_mult(const point_t& v) const {
+        vec_xyzw v4(v.x, v.y, v.z, 1.0f);
+        return trans_mult(v4);
+    }
+
+    mat_44 transpose() const {
+        mat_44 r;
+        r.col0={col0.x,col1.x,col2.x,col3.x};
+        r.col1={col0.y,col1.y,col2.y,col3.y};
+        r.col2={col0.z,col1.z,col2.z,col3.z};
+        r.col3={col0.w,col1.w,col2.w,col3.w};
+        return r;
+    }
+
+    mat_44 operator-() const {
+        mat_44 r; r.col0=-col0; r.col1=-col1; r.col2=-col2; r.col3=-col3; return r;
+    }
+    mat_44 operator+(const mat_44& m) const {
+        mat_44 r; r.col0=col0+m.col0; r.col1=col1+m.col1;
+                  r.col2=col2+m.col2; r.col3=col3+m.col3; return r;
+    }
+    mat_44 operator-(const mat_44& m) const {
+        mat_44 r; r.col0=col0-m.col0; r.col1=col1-m.col1;
+                  r.col2=col2-m.col2; r.col3=col3-m.col3; return r;
+    }
+
+    // mat_44 * mat_44
+    mat_44 operator*(const mat_44& m) const {
+        mat_44 r;
+        r.col0 = *this * m.col0;
+        r.col1 = *this * m.col1;
+        r.col2 = *this * m.col2;
+        r.col3 = *this * m.col3;
+        return r;
+    }
+
+    // Transpose-multiply: M^T * A
+    mat_44 trans_mult(const mat_44& m) const {
+        mat_44 r;
+        r.col0 = trans_mult(m.col0);
+        r.col1 = trans_mult(m.col1);
+        r.col2 = trans_mult(m.col2);
+        r.col3 = trans_mult(m.col3);
+        return r;
+    }
+
+    void set_rotate_x(float a) {
+        float c=cosf(a), s=sinf(a);
+        col0={1,0,0,0}; col1={0,c,s,0}; col2={0,-s,c,0}; col3={0,0,0,1};
+    }
+    void set_rotate_y(float a) {
+        float c=cosf(a), s=sinf(a);
+        col0={c,0,-s,0}; col1={0,1,0,0}; col2={s,0,c,0}; col3={0,0,0,1};
+    }
+    void set_rotate_z(float a) {
+        float c=cosf(a), s=sinf(a);
+        col0={c,s,0,0}; col1={-s,c,0,0}; col2={0,0,1,0}; col3={0,0,0,1};
+    }
+
+    // mat_44 * mat_43 = mat_43
+    // result.col_i = (mat_44) * (mat_43).col_i   [mat_44 * vec_4]
+    mat_43 operator*(const mat_43& m) const {
+        mat_43 r;
+        r.col0 = *this * m.col0;
+        r.col1 = *this * m.col1;
+        r.col2 = *this * m.col2;
+        return r;
+    }
+};
+
+// Row-vector * mat_44: vec_4^T * mat_44 = vec_4
+inline vec_xyzw operator*(const vec_xyzw& v, const mat_44& m) {
+    return {v.dot(m.col0), v.dot(m.col1), v.dot(m.col2), v.dot(m.col3)};
+}
+
+// mat_34 * mat_44 = mat_34 (defined here now that mat_44 is complete)
+inline mat_34 mat_34_mult_mat_44(const mat_34& a, const mat_44& b) {
+    // result.col_i = a * b.col_i   [mat_34 * vec_4]
+    mat_34 r;
+    r.col0 = a * b.col0;
+    r.col1 = a * b.col1;
+    r.col2 = a * b.col2;
+    r.col3 = a * b.col3;
+    return r;
+}
+
+/* ========================================================================
+ * mat_43::transpose / mat_34::transpose – deferred method bodies
+ * ======================================================================== */
+
+// These must live outside the struct bodies since both mat_43 and mat_34
+// need to be complete before defining each other's transpose.
+
+/* ========================================================================
+ * transform_t  (rigid body transform: 3×3 rotation + 3D translation)
+ *
+ * Stored as 4×4 column-major matrix:
+ *   col0, col1, col2 = rotation columns (w=0)
+ *   col3             = translation (w=1)
+ * ======================================================================== */
+
+struct transform_t {
+    vec_xyzw col0, col1, col2, col3;
+
+    transform_t() { set_identity(); }
+
+    void set_identity() {
+        col0={1,0,0,0}; col1={0,1,0,0}; col2={0,0,1,0}; col3={0,0,0,1};
+    }
+    void set_zero() {
+        col0.set_zero(); col1.set_zero(); col2.set_zero(); col3.set_zero();
+    }
+
+    // Row access (includes translation column)
+    vec_xyzw get_row0() const { return {col0.x, col1.x, col2.x, col3.x}; }
+    vec_xyzw get_row1() const { return {col0.y, col1.y, col2.y, col3.y}; }
+    vec_xyzw get_row2() const { return {col0.z, col1.z, col2.z, col3.z}; }
+    vec_xyzw get_row3() const { return {col0.w, col1.w, col2.w, col3.w}; }
+
+    // Apply to 4D vector: result = R*v.xyz + t*v.w
+    vec_xyzw operator*(const vec_xyzw& v) const {
+        vec_xyzw r;
+        r.x = col0.x*v.x + col1.x*v.y + col2.x*v.z + col3.x*v.w;
+        r.y = col0.y*v.x + col1.y*v.y + col2.y*v.z + col3.y*v.w;
+        r.z = col0.z*v.x + col1.z*v.y + col2.z*v.z + col3.z*v.w;
+        r.w = col0.w*v.x + col1.w*v.y + col2.w*v.z + col3.w*v.w;
+        return r;
+    }
+
+    // Apply to direction (w=0): rotation only
+    vector_t operator*(const vector_t& v) const {
+        return { col0.x*v.x + col1.x*v.y + col2.x*v.z,
+                 col0.y*v.x + col1.y*v.y + col2.y*v.z,
+                 col0.z*v.x + col1.z*v.y + col2.z*v.z };
+    }
+
+    // Apply to point (w=1): rotation + translation
+    point_t operator*(const point_t& v) const {
+        return { col0.x*v.x + col1.x*v.y + col2.x*v.z + col3.x,
+                 col0.y*v.x + col1.y*v.y + col2.y*v.z + col3.y,
+                 col0.z*v.x + col1.z*v.y + col2.z*v.z + col3.z };
+    }
+
+    // Compose transforms
+    transform_t operator*(const transform_t& b) const {
+        transform_t r;
+        r.col0 = {col0.x*b.col0.x + col1.x*b.col0.y + col2.x*b.col0.z,
+                  col0.y*b.col0.x + col1.y*b.col0.y + col2.y*b.col0.z,
+                  col0.z*b.col0.x + col1.z*b.col0.y + col2.z*b.col0.z,
+                  0.0f};
+        r.col1 = {col0.x*b.col1.x + col1.x*b.col1.y + col2.x*b.col1.z,
+                  col0.y*b.col1.x + col1.y*b.col1.y + col2.y*b.col1.z,
+                  col0.z*b.col1.x + col1.z*b.col1.y + col2.z*b.col1.z,
+                  0.0f};
+        r.col2 = {col0.x*b.col2.x + col1.x*b.col2.y + col2.x*b.col2.z,
+                  col0.y*b.col2.x + col1.y*b.col2.y + col2.y*b.col2.z,
+                  col0.z*b.col2.x + col1.z*b.col2.y + col2.z*b.col2.z,
+                  0.0f};
+        r.col3 = {col0.x*b.col3.x + col1.x*b.col3.y + col2.x*b.col3.z + col3.x,
+                  col0.y*b.col3.x + col1.y*b.col3.y + col2.y*b.col3.z + col3.y,
+                  col0.z*b.col3.x + col1.z*b.col3.y + col2.z*b.col3.z + col3.z,
+                  1.0f};
+        return r;
+    }
+
+    // Orthonormal inverse: inv = { R^T, -R^T*t, 0,0,0,1 }
+    // inv.col_i (rotation part) = (col0[i], col1[i], col2[i], 0)  [transpose of rotation cols]
+    // inv.col3 = (-dot(col0.xyz, col3.xyz), -dot(col1.xyz, col3.xyz), -dot(col2.xyz, col3.xyz), 1)
+    transform_t orthonormal_inverse() const {
+        transform_t r;
+        // Transposed rotation
+        r.col0 = {col0.x, col1.x, col2.x, 0.0f};
+        r.col1 = {col0.y, col1.y, col2.y, 0.0f};
+        r.col2 = {col0.z, col1.z, col2.z, 0.0f};
+        // Translation: -R^T * t
+        float tx = col3.x, ty = col3.y, tz = col3.z;
+        r.col3 = {-(col0.x*tx + col0.y*ty + col0.z*tz),
+                  -(col1.x*tx + col1.y*ty + col1.z*tz),
+                  -(col2.x*tx + col2.y*ty + col2.z*tz),
+                  1.0f};
+        return r;
+    }
+
+    void orthonormal_inverse_in_place() {
+        *this = orthonormal_inverse();
+    }
+
+    // General inverse (uses orthonormal_inverse since transform_t is always rigid)
+    transform_t inverse() const {
+        return orthonormal_inverse();
+    }
+
+    void set_rotation(const mat_33& rot) {
+        col0 = {rot.col0.x, rot.col0.y, rot.col0.z, 0.0f};
+        col1 = {rot.col1.x, rot.col1.y, rot.col1.z, 0.0f};
+        col2 = {rot.col2.x, rot.col2.y, rot.col2.z, 0.0f};
+    }
+    void set_translation(const vec_xyz& t) {
+        col3 = {t.x, t.y, t.z, 1.0f};
+    }
+    void set_rotate_x(float a) {
+        float c=cosf(a), s=sinf(a);
+        col0={1,0,0,0}; col1={0,c,s,0}; col2={0,-s,c,0}; col3={0,0,0,1};
+    }
+    void set_rotate_y(float a) {
+        float c=cosf(a), s=sinf(a);
+        col0={c,0,-s,0}; col1={0,1,0,0}; col2={s,0,c,0}; col3={0,0,0,1};
+    }
+    void set_rotate_z(float a) {
+        float c=cosf(a), s=sinf(a);
+        col0={c,s,0,0}; col1={-s,c,0,0}; col2={0,0,1,0}; col3={0,0,0,1};
+    }
+};
+
+/* ========================================================================
+ * Deferred member function bodies (all types now complete)
+ * ======================================================================== */
+
+// mat_43::transpose() -> mat_34
+inline mat_34 mat_43::transpose() const { return mat_43_to_mat_34(*this); }
+
+// mat_34::transpose() -> mat_43
+inline mat_43 mat_34::transpose() const { return mat_34_to_mat_43(*this); }
+
+// mat_34 * mat_44 -> mat_34
+inline mat_34 mat_34::operator*(const mat_44& m) const {
+    return mat_34_mult_mat_44(*this, m);
+}
+
diff --git a/include/ps2s/cpu_matrix.h b/include/ps2s/cpu_matrix.h
index 3655312..a86b422 100644
--- a/include/ps2s/cpu_matrix.h
+++ b/include/ps2s/cpu_matrix.h
@@ -80,6 +80,7 @@ class cpu_mat_44 {
 inline cpu_vec_4
     cpu_mat_44::operator*(const cpu_vec_4& rhs) const
 {
+#if defined(NO_ASM) || defined(NO_VU0_VECTORS)
     cpu_vec_4 result;
 
     cpu_vec_4 row0(col0(0), col1(0), col2(0), col3(0));
@@ -95,11 +96,33 @@ inline cpu_vec_4
     result[3] = row3.dot(rhs);
 
     return result;
+#else
+    // VU0 macro-mode: result = col0*rhs.x + col1*rhs.y + col2*rhs.z + col3*rhs.w
+    // Requires cpu_vec_4 / cpu_mat_44 to be 16-byte aligned (enforced via alignas).
+    cpu_vec_4 result;
+    asm volatile(
+        " ### cpu_mat_44 * cpu_vec_4 ### \n"
+        "lqc2     $vf1,  0(%[M])    \n"
+        "lqc2     $vf2, 16(%[M])    \n"
+        "lqc2     $vf3, 32(%[M])    \n"
+        "lqc2     $vf4, 48(%[M])    \n"
+        "lqc2     $vf5,  0(%[V])    \n"
+        "vmulax    $ACC, $vf1, $vf5 \n"
+        "vmadday   $ACC, $vf2, $vf5 \n"
+        "vmaddaz   $ACC, $vf3, $vf5 \n"
+        "vmaddw    $vf6, $vf4, $vf5 \n"
+        "sqc2     $vf6,  0(%[R])    \n"
+        :
+        : [M] "r"(this), [V] "r"(&rhs), [R] "r"(&result)
+        : "memory");
+    return result;
+#endif
 }
 
 inline cpu_mat_44
     cpu_mat_44::operator*(const cpu_mat_44& rhs) const
 {
+#if defined(NO_ASM) || defined(NO_VU0_VECTORS)
     cpu_mat_44 result;
 
     result.col0 = *this * rhs.get_col0();
@@ -108,6 +131,48 @@ inline cpu_mat_44
     result.col3 = *this * rhs.get_col3();
 
     return result;
+#else
+    // VU0 macro-mode: load this once into vf1..vf4, apply to each col of rhs.
+    cpu_mat_44 result;
+    asm volatile(
+        " ### cpu_mat_44 * cpu_mat_44 ### \n"
+        "lqc2     $vf1,  0(%[A])    \n"
+        "lqc2     $vf2, 16(%[A])    \n"
+        "lqc2     $vf3, 32(%[A])    \n"
+        "lqc2     $vf4, 48(%[A])    \n"
+
+        "lqc2     $vf5,  0(%[B])    \n"
+        "vmulax    $ACC, $vf1, $vf5 \n"
+        "vmadday   $ACC, $vf2, $vf5 \n"
+        "vmaddaz   $ACC, $vf3, $vf5 \n"
+        "vmaddw    $vf6, $vf4, $vf5 \n"
+        "sqc2     $vf6,  0(%[R])    \n"
+
+        "lqc2     $vf5, 16(%[B])    \n"
+        "vmulax    $ACC, $vf1, $vf5 \n"
+        "vmadday   $ACC, $vf2, $vf5 \n"
+        "vmaddaz   $ACC, $vf3, $vf5 \n"
+        "vmaddw    $vf6, $vf4, $vf5 \n"
+        "sqc2     $vf6, 16(%[R])    \n"
+
+        "lqc2     $vf5, 32(%[B])    \n"
+        "vmulax    $ACC, $vf1, $vf5 \n"
+        "vmadday   $ACC, $vf2, $vf5 \n"
+        "vmaddaz   $ACC, $vf3, $vf5 \n"
+        "vmaddw    $vf6, $vf4, $vf5 \n"
+        "sqc2     $vf6, 32(%[R])    \n"
+
+        "lqc2     $vf5, 48(%[B])    \n"
+        "vmulax    $ACC, $vf1, $vf5 \n"
+        "vmadday   $ACC, $vf2, $vf5 \n"
+        "vmaddaz   $ACC, $vf3, $vf5 \n"
+        "vmaddw    $vf6, $vf4, $vf5 \n"
+        "sqc2     $vf6, 48(%[R])    \n"
+        :
+        : [A] "r"(this), [B] "r"(&rhs), [R] "r"(&result)
+        : "memory");
+    return result;
+#endif
 }
 
 #endif // ps2s_cpu_matrix_h
diff --git a/include/ps2s/cpu_vector.h b/include/ps2s/cpu_vector.h
index 4da0508..f0de4ed 100644
--- a/include/ps2s/cpu_vector.h
+++ b/include/ps2s/cpu_vector.h
@@ -70,8 +70,8 @@ class cpu_vec_3 {
             "mtc1    _temp0, _z          # z = value.z \n"
 
             ".endif \n"
-            : "=&r,f _x"(x), "=&r,f _y"(y), "=&r,f _z"(z), "=&r,&r _temp0"(temp0)
-            : "r,r _vec"(vec));
+            : "=&r,&f"(x), "=&r,&f"(y), "=&r,&f"(z), "=&r"(temp0)
+            : "r"(vec));
     }
 
     explicit inline cpu_vec_3(const vec_3 vec) { set(vec.vec128); }
@@ -156,7 +156,7 @@ typedef cpu_vec_3 cpu_vec_xyz;
  * cpu_vec_4 (or cpu_vec_xyzw)
  */
 
-class cpu_vec_4 {
+class alignas(16) cpu_vec_4 {
 public:
     float x, y, z, w;
 
@@ -215,8 +215,8 @@ class cpu_vec_4 {
             "mtc1    _temp0, _w          # w = value.w \n"
 
             ".endif \n"
-            : "=&r,f _x"(x), "=&r,f _y"(y), "=&r,f _z"(z), "=&r,f _w"(w), "=&r,&r _temp0"(temp0)
-            : "r,r _vec"(vec));
+            : "=&r,&f"(x), "=&r,&f"(y), "=&r,&f"(z), "=&r,&f"(w), "=&r"(temp0)
+            : "r"(vec));
     }
 
     explicit inline cpu_vec_4(const vec_4 vec) { set(vec.vec128); }
@@ -335,12 +335,12 @@ cpu_vec_3::operator+(const cpu_vec_3& vec)
 #else
 
     asm(" ### cpu_vec_3 + cpu_vec_3 ### \n"
-        "add.s rx, v0x, v1x \n"
-        "add.s ry, v0y, v1y \n"
-        "add.s rz, v0z, v1z \n"
-        : "=&f rx"(result.x), "=&f ry"(result.y), "=&f rz"(result.z)
-        : "f v0x"(x), "f v0y"(y), "f v0z"(z),
-        "f v1x"(vec.x), "f v1y"(vec.y), "f v1z"(vec.z));
+        "add.s %[rx], %[v0x], %[v1x] \n"
+        "add.s %[ry], %[v0y], %[v1y] \n"
+        "add.s %[rz], %[v0z], %[v1z] \n"
+        : [rx] "=&f"(result.x), [ry] "=&f"(result.y), [rz] "=&f"(result.z)
+        : [v0x] "f"(x), [v0y] "f"(y), [v0z] "f"(z),
+        [v1x] "f"(vec.x), [v1y] "f"(vec.y), [v1z] "f"(vec.z));
 
 #endif
     return result;
@@ -359,12 +359,12 @@ cpu_vec_3::operator-(const cpu_vec_3& vec)
 #else
 
     asm(" ### cpu_vec_3 - cpu_vec_3 ### \n"
-        "sub.s rx, v0x, v1x \n"
-        "sub.s ry, v0y, v1y \n"
-        "sub.s rz, v0z, v1z \n"
-        : "=&f rx"(result.x), "=&f ry"(result.y), "=&f rz"(result.z)
-        : "f v0x"(x), "f v0y"(y), "f v0z"(z),
-        "f v1x"(vec.x), "f v1y"(vec.y), "f v1z"(vec.z));
+        "sub.s %[rx], %[v0x], %[v1x] \n"
+        "sub.s %[ry], %[v0y], %[v1y] \n"
+        "sub.s %[rz], %[v0z], %[v1z] \n"
+        : [rx] "=&f"(result.x), [ry] "=&f"(result.y), [rz] "=&f"(result.z)
+        : [v0x] "f"(x), [v0y] "f"(y), [v0z] "f"(z),
+        [v1x] "f"(vec.x), [v1y] "f"(vec.y), [v1z] "f"(vec.z));
 
 #endif
     return result;
@@ -488,12 +488,21 @@ inline cpu_vec_4
 cpu_vec_4::operator+(const cpu_vec_4& vec)
 {
     cpu_vec_4 result;
-
+#if defined(NO_ASM) || defined(NO_VU0_VECTORS)
     result.x = x + vec.x;
     result.y = y + vec.y;
     result.z = z + vec.z;
     result.w = w + vec.w;
-
+#else
+    asm volatile(" ### cpu_vec_4 + cpu_vec_4 (VU0) ### \n"
+                 "lqc2  $vf1, 0(%[A]) \n"
+                 "lqc2  $vf2, 0(%[B]) \n"
+                 "vadd.xyzw $vf3, $vf1, $vf2 \n"
+                 "sqc2  $vf3, 0(%[R]) \n"
+        :
+        : [A] "r"(this), [B] "r"(&vec), [R] "r"(&result)
+        : "memory");
+#endif
     return result;
 }
 
@@ -501,12 +510,21 @@ inline cpu_vec_4
 cpu_vec_4::operator-(const cpu_vec_4& vec)
 {
     cpu_vec_4 result;
-
+#if defined(NO_ASM) || defined(NO_VU0_VECTORS)
     result.x = x - vec.x;
     result.y = y - vec.y;
     result.z = z - vec.z;
     result.w = w - vec.w;
-
+#else
+    asm volatile(" ### cpu_vec_4 - cpu_vec_4 (VU0) ### \n"
+                 "lqc2  $vf1, 0(%[A]) \n"
+                 "lqc2  $vf2, 0(%[B]) \n"
+                 "vsub.xyzw $vf3, $vf1, $vf2 \n"
+                 "sqc2  $vf3, 0(%[R]) \n"
+        :
+        : [A] "r"(this), [B] "r"(&vec), [R] "r"(&result)
+        : "memory");
+#endif
     return result;
 }
 
@@ -514,12 +532,21 @@ inline cpu_vec_4
 cpu_vec_4::operator-()
 {
     cpu_vec_4 result;
-
+#if defined(NO_ASM) || defined(NO_VU0_VECTORS)
     result.x = -x;
     result.y = -y;
     result.z = -z;
     result.w = -w;
-
+#else
+    asm volatile(" ### -cpu_vec_4 (VU0) ### \n"
+                 "lqc2  $vf1, 0(%[A]) \n"
+                 "vsub.xyzw $vf2, $vf1, $vf1 \n"   // vf2 = 0 (vf0.w is 1.0, can't use it)
+                 "vsub.xyzw $vf3, $vf2, $vf1 \n"   // vf3 = 0 - v
+                 "sqc2  $vf3, 0(%[R]) \n"
+        :
+        : [A] "r"(this), [R] "r"(&result)
+        : "memory");
+#endif
     return result;
 }
 
@@ -527,12 +554,21 @@ inline cpu_vec_4
     cpu_vec_4::operator*(const cpu_vec_4& vec)
 {
     cpu_vec_4 result;
-
+#if defined(NO_ASM) || defined(NO_VU0_VECTORS)
     result.x = x * vec.x;
     result.y = y * vec.y;
     result.z = z * vec.z;
     result.w = w * vec.w;
-
+#else
+    asm volatile(" ### cpu_vec_4 * cpu_vec_4 (VU0) ### \n"
+                 "lqc2  $vf1, 0(%[A]) \n"
+                 "lqc2  $vf2, 0(%[B]) \n"
+                 "vmul.xyzw $vf3, $vf1, $vf2 \n"
+                 "sqc2  $vf3, 0(%[R]) \n"
+        :
+        : [A] "r"(this), [B] "r"(&vec), [R] "r"(&result)
+        : "memory");
+#endif
     return result;
 }
 
diff --git a/include/ps2s/matrix.h b/include/ps2s/matrix.h
index 2fd63bb..df6aee6 100644
--- a/include/ps2s/matrix.h
+++ b/include/ps2s/matrix.h
@@ -51,70 +51,82 @@ class mat_33 : public mat_x3_template<vec_3> {
 
     void set_identity()
     {
-        asm(" ### mat_33::set_identity ### \n"
-            "vsub		%[col0], %[col0], %[col0] \n"
-            "vsub		%[col1], %[col1], %[col1] \n"
-            "vmr32		%[col2], vf00             \n"
-            "vaddw.x		%[col0], vf00, vf00       \n"
-            "vaddw.y		%[col1], vf00, vf00       \n"
-            : [col0] "=j"(col0),
-            [col1] "=j"(col1),
-            [col2] "=j"(col2));
+        asm __volatile__(
+            "vsub       $vf1, $vf0, $vf0         \n"
+            "vsub       $vf2, $vf0, $vf0         \n"
+            "vmr32      $vf3, $vf0               \n"
+            "vaddw.x    $vf1, $vf1, $vf0         \n"
+            "vaddw.y    $vf2, $vf2, $vf0         \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128)
+            :
+            : "memory");
     }
 
     void
     set_scale(vec_3 scale)
     {
         set_zero();
-        col0 = (vec_x)scale;
-        col1 = (vec_y)scale;
-        col2 = (vec_z)scale;
+        // Extract components and set diagonal elements using VU0
+        float sx = (float)vec_x(scale);
+        float sy = (float)vec_y(scale);
+        float sz = (float)vec_z(scale);
+        asm __volatile__(
+            "ctc2       %[sx], $vi21              \n"
+            "vaddi.x    $vf1, $vf0, $I           \n"
+            "ctc2       %[sy], $vi21              \n"
+            "vaddi.y    $vf2, $vf0, $I           \n"
+            "ctc2       %[sz], $vi21              \n"
+            "vaddi.z    $vf3, $vf0, $I           \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128)
+            : [sx] "r"(sx), [sy] "r"(sy), [sz] "r"(sz)
+            : "memory");
     }
 
     void
     set(vec_4 quat)
     {
-        vec128_t ones, q_yzx, q_zxy;
-
-        asm("### set rotation from unit quaternion ### \n"
-
-            "vmr32.xy	%[q_yzx], %[quat] \n"
-            "vaddx.z		%[q_yzx], vf00, %[quat] \n"
-            "vaddz.x		%[q_zxy], vf00, %[quat] \n"
-            "vaddx.y		%[q_zxy], vf00, %[quat] \n"
-            "vaddy.z		%[q_zxy], vf00, %[quat] \n"
-
-            "vmula		ACC, %[quat], %[q_yzx] \n"
-            "vmsubw.z	%[col0], %[q_zxy], %[quat] \n"
-            "vmsubw.x	%[col1], %[q_zxy], %[quat] \n"
-            "vmsubw.y	%[col2], %[q_zxy], %[quat] \n"
-
-            "vmula		ACC, %[quat], %[q_zxy] \n"
-            "vmaddw.y	%[col0], %[q_yzx], %[quat] \n"
-            "vmaddw.z	%[col1], %[q_yzx], %[quat] \n"
-            "vmaddw.x	%[col2], %[q_yzx], %[quat] \n"
-
-            "vmula		ACC, %[q_zxy], %[q_zxy] \n"
-            "vmadd.x		%[col0], %[q_yzx], %[q_yzx] \n"
-            "vmadd.y		%[col1], %[q_yzx], %[q_yzx] \n"
-            "vmadd.z		%[col2], %[q_yzx], %[q_yzx] \n"
-
-            "vmaxw		%[ones], vf00, vf00 \n"
-            "vadd		%[col0], %[col0], %[col0] \n"
-            "vadd		%[col1], %[col1], %[col1] \n"
-            "vadd		%[col2], %[col2], %[col2] \n"
-
-            "vsub.x		%[col0], %[ones], %[col0] \n"
-            "vsub.y		%[col1], %[ones], %[col1] \n"
-            "vsub.z		%[col2], %[ones], %[col2] \n"
-
-            : [col0] "=&j"(col0),
-            [col1] "=&j"(col1),
-            [col2] "=&j"(col2),
-            [q_yzx] "=&j"(q_yzx),
-            [q_zxy] "=&j"(q_zxy),
-            [ones] "=&j"(ones), "=r"(vu0_ACC)
-            : [quat] "j"(quat));
+        vec128_t qv = quat.vec128;
+        vec128_t col0_v128, col1_v128, col2_v128;
+        asm __volatile__ ("### set rotation from unit quaternion ### \n"
+            "qmtc2 %[quat], $vf1                    \n"
+            "vmr32.xy $vf2, $vf1                    \n"
+            "vaddx.z  $vf2, $vf0, $vf1             \n"
+            "vaddz.x  $vf3, $vf0, $vf1             \n"
+            "vaddx.y  $vf3, $vf0, $vf1             \n"
+            "vaddy.z  $vf3, $vf0, $vf1             \n"
+            "vmula $ACC, $vf1, $vf2                 \n"
+            "vmsubw.z $vf4, $vf3, $vf1             \n"
+            "vmsubw.x $vf5, $vf3, $vf1             \n"
+            "vmsubw.y $vf6, $vf3, $vf1             \n"
+            "vmula $ACC, $vf1, $vf3                 \n"
+            "vmaddw.y $vf4, $vf2, $vf1             \n"
+            "vmaddw.z $vf5, $vf2, $vf1             \n"
+            "vmaddw.x $vf6, $vf2, $vf1             \n"
+            "vmula $ACC, $vf3, $vf3                 \n"
+            "vmadd.x  $vf4, $vf2, $vf2             \n"
+            "vmadd.y  $vf5, $vf2, $vf2             \n"
+            "vmadd.z  $vf6, $vf2, $vf2             \n"
+            "vmaxw $vf7, $vf0, $vf0                 \n"
+            "vadd $vf4, $vf4, $vf4                  \n"
+            "vadd $vf5, $vf5, $vf5                  \n"
+            "vadd $vf6, $vf6, $vf6                  \n"
+            "vsub.x $vf4, $vf7, $vf4               \n"
+            "vsub.y $vf5, $vf7, $vf5               \n"
+            "vsub.z $vf6, $vf7, $vf6               \n"
+            "qmfc2 %[c0], $vf4                      \n"
+            "qmfc2 %[c1], $vf5                      \n"
+            "qmfc2 %[c2], $vf6                      \n"
+            : [c0] "=r"(col0_v128), [c1] "=r"(col1_v128), [c2] "=r"(col2_v128), "=r"(vu0_ACC)
+            : [quat] "r"(qv));
+        col0.vec128 = col0_v128;
+        col1.vec128 = col1_v128;
+        col2.vec128 = col2_v128;
     }
 
     mat_33(vec_4 quat)
@@ -219,25 +231,26 @@ class mat_33 : public mat_x3_template<vec_3> {
         vec128_t result, temp0, temp1, temp2, ones;
 
         asm("### mat_33 trans_mult vec_3 ### \n"
-            "vmul		%[temp0], col0], %[vec] \n"
-            "vmaxw		%[ones], vf00, vf00 \n"
-            "vmul		%[temp1], %[col1], %[vec] \n"
-            "vmul		%[temp2], %[col2], %[vec] \n"
-            "vadday.x	ACC, %[temp0], %[temp0] \n"
-            "vmaddz.x	%[result], %[ones], %[temp0] \n"
-            "vaddax.y	ACC, %[temp1], %[temp1] \n"
-            "vmaddz.y	%[result], %[ones], %[temp1] \n"
-            "vaddax.z	ACC, %[temp2], %[temp2] \n"
-            "vmaddy.z	%[result], %[ones], %[temp2] \n"
-            : [result] "=&j"(result),
-            [temp0] "=&j"(temp0),
-            [temp1] "=&j"(temp1),
-            [temp2] "=&j"(temp2),
-            [ones] "=&j"(ones), "=r"(vu0_ACC)
-            : "j col0"(col0),
-            "j col1"(col1),
-            "j col2"(col2),
-            "j vec"(vec));
+            "qmtc2      %[col0], $vf10        \n"
+            "qmtc2      %[col1], $vf11        \n"
+            "qmtc2      %[col2], $vf12        \n"
+            "qmtc2      %[vec], $vf13         \n"
+            "vmul       $vf14, $vf10, $vf13   \n"
+            "vmaxw      $vf15, $vf0, $vf0     \n"
+            "vmul       $vf16, $vf11, $vf13   \n"
+            "vmul       $vf17, $vf12, $vf13   \n"
+            "vadday.x   $ACC, $vf14, $vf14    \n"
+            "vmaddz.x   $vf14, $vf15, $vf14   \n"
+            "vaddax.y   $ACC, $vf16, $vf16    \n"
+            "vmaddz.y   $vf14, $vf15, $vf16   \n"
+            "vaddax.z   $ACC, $vf17, $vf17    \n"
+            "vmaddy.z   $vf14, $vf15, $vf17   \n"
+            "qmfc2      %[result], $vf14      \n"
+            : [result] "=&r"(result), "=r"(vu0_ACC)
+            : [col0] "r"(col0.vec128),
+            [col1] "r"(col1.vec128),
+            [col2] "r"(col2.vec128),
+            [vec] "r"(vec.vec128));
 
         return vec_3(result);
     }
@@ -383,81 +396,102 @@ class mat_43 : public mat_x3_template<vec_4> {
     vec_3
     trans_mult(vec_4 vec) const
     {
-        vec128_t result, temp0, temp1, temp2, ones;
-
-        asm("### mat_43 trans_mult vec_4 ### \n"
-            "vmul		%[temp0], %[col0], %[vec] \n"
-            "vmaxw		%[ones], vf00, vf00 \n"
-            "vmul		%[temp1], %[col1], %[vec] \n"
-            "vmul		%[temp2], %[col2], %[vec] \n"
-            "vadday.x	ACC, %[temp0], %[temp0] \n"
-            "vmaddaz.x	ACC, %[ones], %[temp0] \n"
-            "vmaddw.x	%[result], %[ones], %[temp0] \n"
-            "vaddax.y	ACC, %[temp1], %[temp1] \n"
-            "vmaddaz.y	ACC, %[ones], %[temp1] \n"
-            "vmaddw.y	%[result], %[ones], %[temp1] \n"
-            "vaddax.z	ACC, %[temp2], %[temp2] \n"
-            "vmadday.z	ACC, %[ones], %[temp2] \n"
-            "vmaddw.z	%[result], %[ones], %[temp2] \n"
-            : [result] "=&j"(result),
-            [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [ones] "=&j"(ones), "=r"(vu0_ACC)
-            : [col0] "j"(col0),
-            [col1] "j"(col1),
-            [col2] "j"(col2),
-            [vec] "j"(vec));
-
+        vec128_t result;
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t vec_val = vec.vec128;
+        asm __volatile__(
+            "### mat_43 trans_mult vec_4 ### \n"
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[vec_val], $vf4         \n"
+            "vmaxw      $vf5, $vf0, $vf0         \n"
+            "vmul       $vf6, $vf1, $vf4         \n"
+            "vmul       $vf7, $vf2, $vf4         \n"
+            "vmul       $vf8, $vf3, $vf4         \n"
+            "vadday.x   $ACC, $vf6, $vf6         \n"
+            "vmaddaz.x  $ACC, $vf5, $vf6         \n"
+            "vmaddw.x   $vf9, $vf5, $vf6         \n"
+            "vaddax.y   $ACC, $vf7, $vf7         \n"
+            "vmaddaz.y  $ACC, $vf5, $vf7         \n"
+            "vmaddw.y   $vf9, $vf5, $vf7         \n"
+            "vaddax.z   $ACC, $vf8, $vf8         \n"
+            "vmadday.z  $ACC, $vf5, $vf8         \n"
+            "vmaddw.z   $vf9, $vf5, $vf8         \n"
+            "qmfc2      %[result], $vf9          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val),
+              [col2_val] "r"(col2_val), [vec_val] "r"(vec_val)
+            : "memory");
         return vec_3(result);
     }
 
     vec_3
     trans_mult(vector_t vec) const
     {
-        vec128_t result, temp0, temp1, temp2, ones;
-
-        asm("### mat_43 trans_mult vector_t ### \n"
-            "vmul		%[temp0], %[col0], %[vec] \n"
-            "vmaxw		%[ones], vf00, vf00 \n"
-            "vmul		%[temp1], %[col1], %[vec] \n"
-            "vmul		%[temp2], %[col2], %[vec] \n"
-            "vadday.x	ACC, %[temp0], %[temp0] \n"
-            "vmaddz.x	%[result], %[ones], %[temp0] \n"
-            "vaddax.y	ACC, %[temp1], %[temp1] \n"
-            "vmaddz.y	%[result, %[ones], %[temp1] \n"
-            "vaddax.z	ACC, %[temp2], %[temp2] \n"
-            "vmaddy.z	%[result], %[ones], %[temp2] \n"
-            : [result] "=&j"(result),
-            [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [ones] "=&j"(ones), "=r"(vu0_ACC)
-            : [col0] "j"(col0),
-            [col1] "j"(col1),
-            [col2] "j"(col2),
-            [vec] "j"(vec));
-
+        vec128_t result;
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t vec_val = vec.vec128;
+        asm __volatile__(
+            "### mat_43 trans_mult vector_t ### \n"
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[vec_val], $vf4         \n"
+            "vmaxw      $vf5, $vf0, $vf0         \n"
+            "vmul       $vf6, $vf1, $vf4         \n"
+            "vmul       $vf7, $vf2, $vf4         \n"
+            "vmul       $vf8, $vf3, $vf4         \n"
+            "vadday.x   $ACC, $vf6, $vf6         \n"
+            "vmaddz.x   $vf9, $vf5, $vf6         \n"
+            "vaddax.y   $ACC, $vf7, $vf7         \n"
+            "vmaddz.y   $vf9, $vf5, $vf7         \n"
+            "vaddax.z   $ACC, $vf8, $vf8         \n"
+            "vmaddy.z   $vf9, $vf5, $vf8         \n"
+            "qmfc2      %[result], $vf9          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val),
+              [col2_val] "r"(col2_val), [vec_val] "r"(vec_val)
+            : "memory");
         return vec_3(result);
     }
 
     vec_3
     trans_mult(point_t vec) const
     {
-        vec128_t result, temp0, temp1, temp2, ones;
-
-        asm("### mat_43 trans_mult point_t ### \n"
-            "vmul		%[temp0], %[col0], %[vec] \n"
-            "vmaxw		%[ones], vf00, vf00 \n"
-            "vmul		%[temp1], %[col1], %[vec] \n"
-            "vmul		%[temp2], %[col2], %[vec] \n"
-            "vadday.x	ACC, %[temp0], %[temp0] \n"
-            "vmaddaz.x	ACC, %[ones], %[temp0] \n"
-            "vmaddw.x	%[result], %[ones], %[col0] \n"
-            "vaddax.y	ACC, %[temp1], %[temp1] \n"
-            "vmaddaz.y	ACC, %[ones], %[temp1] \n"
-            "vmaddw.y	%[result], %[ones], %[col1] \n"
-            "vaddax.z	ACC, %[temp2], %[temp2] \n"
-            "vmadday.z	ACC, %[ones], %[temp2] \n"
-            "vmaddw.z	%[result], %[ones], %[col2] \n"
-            : [result] "=&j"(result),
-            [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [ones] "=&j"(ones), "=r"(vu0_ACC)
-            : [col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2), [vec] "j"(vec));
-
+        vec128_t result;
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t vec_val = vec.vec128;
+        asm __volatile__(
+            "### mat_43 trans_mult point_t ### \n"
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[vec_val], $vf4         \n"
+            "vmaxw      $vf5, $vf0, $vf0         \n"
+            "vmul       $vf6, $vf1, $vf4         \n"
+            "vmul       $vf7, $vf2, $vf4         \n"
+            "vmul       $vf8, $vf3, $vf4         \n"
+            "vadday.x   $ACC, $vf6, $vf6         \n"
+            "vmaddaz.x  $ACC, $vf5, $vf6         \n"
+            "vmaddw.x   $vf9, $vf5, $vf1         \n"
+            "vaddax.y   $ACC, $vf7, $vf7         \n"
+            "vmaddaz.y  $ACC, $vf5, $vf7         \n"
+            "vmaddw.y   $vf9, $vf5, $vf2         \n"
+            "vaddax.z   $ACC, $vf8, $vf8         \n"
+            "vmadday.z  $ACC, $vf5, $vf8         \n"
+            "vmaddw.z   $vf9, $vf5, $vf3         \n"
+            "qmfc2      %[result], $vf9          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val),
+              [col2_val] "r"(col2_val), [vec_val] "r"(vec_val)
+            : "memory");
         return vec_3(result);
     }
 
@@ -622,26 +656,37 @@ class mat_34 : public mat_x4_template<vec_3> {
     vec_4
     trans_mult(vec_3 vec) const
     {
-        vec128_t result, temp0, temp1, temp2, temp3;
-
-        asm("### mat_34 trans_mult vec_3 ### \n"
-            "vmul	%[temp3], %[col3], %[vec] \n"
-            "vmul	%[temp0], %[col0], %[vec] \n"
-            "vmul	%[temp1], %[col1], %[vec] \n"
-            "vmul	%[temp2], %[col2], %[vec] \n"
-            "vmulx.w	%[temp3], vf00, %[temp3] \n"
-            "vaddy.x	%[temp0], %[temp0], %[temp0] \n"
-            "vaddx.y	%[temp1], %[temp1], %[temp1] \n"
-            "vaddx.z	%[temp2], %[temp2], %[temp2] \n"
-            "vaddy.w	%[temp3], %[temp3], %[temp3] \n"
-            "vaddz.x	%[result], %[temp0], %[temp0] \n"
-            "vaddz.y	%[result], %[temp1], %[temp1] \n"
-            "vaddy.z	%[result], %[temp2], %[temp2] \n"
-            "vaddz.w	%[result], %[temp3], %[temp3] \n"
-            : [result] "=&j"(result),
-            [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [temp3] "=&j"(temp3)
-            : [col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2), [col3] "j"(col3), [vec] "j"(vec));
-
+        vec128_t result;
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        vec128_t vec_val = vec.vec128;
+        asm __volatile__(
+            "### mat_34 trans_mult vec_3 ### \n"
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "qmtc2      %[vec_val], $vf5         \n"
+            "vmul       $vf7, $vf4, $vf5         \n"
+            "vmul       $vf6, $vf1, $vf5         \n"
+            "vmul       $vf8, $vf2, $vf5         \n"
+            "vmul       $vf9, $vf3, $vf5         \n"
+            "vmulx.w    $vf7, $vf0, $vf7         \n"
+            "vaddy.x    $vf6, $vf6, $vf6         \n"
+            "vaddx.y    $vf8, $vf8, $vf8         \n"
+            "vaddx.z    $vf9, $vf9, $vf9         \n"
+            "vaddy.w    $vf7, $vf7, $vf7         \n"
+            "vaddz.x    $vf10, $vf6, $vf6        \n"
+            "vaddz.y    $vf10, $vf8, $vf8        \n"
+            "vaddy.z    $vf10, $vf9, $vf9        \n"
+            "vaddz.w    $vf10, $vf7, $vf7        \n"
+            "qmfc2      %[result], $vf10         \n"
+            : [result] "=&r"(result)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val),
+              [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val)
+            : "memory");
         return vec_4(result);
     }
 
@@ -742,43 +787,91 @@ class mat_44 : public mat_x4_template<vec_4> {
 
     void set_identity()
     {
-        asm(" ### mat_44::set_identity ### \n"
-            "vsub	%[col0], %[col0], %[col0] \n"
-            "vsub	%[col1], %[col1], %[col1] \n"
-            "vmr32	%[col2], vf00 \n"
-            "vmove	%[col3], vf00 \n"
-            "vaddw.x	%[col0], vf00, vf00 \n"
-            "vaddw.y	%[col1], vf00, vf00 \n"
-            : [col0] "=j"(col0),
-            [col1] "=j"(col1),
-            [col2] "=j"(col2),
-            [col3] "=j"(col3));
+        asm __volatile__(
+            "vsub       $vf1, $vf0, $vf0         \n"
+            "vsub       $vf2, $vf0, $vf0         \n"
+            "vmr32      $vf3, $vf0               \n"
+            "vmove      $vf4, $vf0               \n"
+            "vaddw.x    $vf1, $vf1, $vf0         \n"
+            "vaddw.y    $vf2, $vf2, $vf0         \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            "qmfc2      %[col3], $vf4            \n"
+            : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128), [col3] "=r"(col3.vec128)
+            :
+            : "memory");
     }
 
     void
     set_scale(vec_3 scale)
     {
-        set_identity();
-        col0 = (vec_x)scale;
-        col1 = (vec_y)scale;
-        col2 = (vec_z)scale;
+        set_zero();
+        // Extract components and set diagonal elements using VU0
+        float sx = (float)vec_x(scale);
+        float sy = (float)vec_y(scale);
+        float sz = (float)vec_z(scale);
+        asm __volatile__(
+            "ctc2       %[sx], $vi21              \n"
+            "vaddi.x    $vf1, $vf0, $I           \n"
+            "ctc2       %[sy], $vi21              \n"
+            "vaddi.y    $vf2, $vf0, $I           \n"
+            "ctc2       %[sz], $vi21              \n"
+            "vaddi.z    $vf3, $vf0, $I           \n"
+            "vmove      $vf4, $vf0               \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            "qmfc2      %[col3], $vf4            \n"
+            : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128), [col3] "=r"(col3.vec128)
+            : [sx] "r"(sx), [sy] "r"(sy), [sz] "r"(sz)
+            : "memory");
     }
 
     void
     set_scale(vec_4 scale)
     {
         set_identity();
-        col0 = (vec_x)scale;
-        col1 = (vec_y)scale;
-        col2 = (vec_z)scale;
-        col3 = (vec_w)scale;
+        // Extract components and set diagonal elements using VU0
+        float sx = (float)vec_x(scale);
+        float sy = (float)vec_y(scale);
+        float sz = (float)vec_z(scale);
+        float sw = (float)vec_w(scale);
+        asm __volatile__(
+            "vsub       $vf1, $vf0, $vf0         \n"
+            "vsub       $vf2, $vf0, $vf0         \n"
+            "vsub       $vf3, $vf0, $vf0         \n"
+            "vsub       $vf4, $vf0, $vf0         \n"
+            "ctc2       %[sx], $vi21              \n"
+            "vaddi.x    $vf1, $vf0, $I           \n"
+            "ctc2       %[sy], $vi21              \n"
+            "vaddi.y    $vf2, $vf0, $I           \n"
+            "ctc2       %[sz], $vi21              \n"
+            "vaddi.z    $vf3, $vf0, $I           \n"
+            "ctc2       %[sw], $vi21              \n"
+            "vmuli.w    $vf4, $vf0, $I           \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            "qmfc2      %[col3], $vf4            \n"
+            : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128), [col3] "=r"(col3.vec128)
+            : [sx] "r"(sx), [sy] "r"(sy), [sz] "r"(sz), [sw] "r"(sw)
+            : "memory");
     }
 
     void
     set_translate(vec_3 xlate_amount)
     {
         set_identity();
-        col3 = xlate_amount;
+        // Set translation in col3, preserving w=1 from set_identity
+        float tx = (float)vec_x(xlate_amount);
+        float ty = (float)vec_y(xlate_amount);
+        float tz = (float)vec_z(xlate_amount);
+        col3.set_x(tx);
+        col3.set_y(ty);
+        col3.set_z(tz);
+        // w should already be 1.0 from set_identity, but ensure it
+        col3.set_w(1.0f);
     }
 
     void
@@ -882,95 +975,117 @@ class mat_44 : public mat_x4_template<vec_4> {
     vec_4
     trans_mult(vec_4 vec) const
     {
-        vec128_t result, temp0, temp1, temp2, temp3;
-
-        asm("### mat_44 trans_mult vec_4 ### \n"
-            "vmul	%[temp0], %[col0], %[vec] \n"
-            "vmul	%[temp1], %[col1], %[vec] \n"
-            "vmul	%[temp2], %[col2], %[vec] \n"
-            "vmul	%[temp3], %[col3], %[vec] \n"
-            "vaddy.x	%[temp0], %[temp0], %[temp0] \n"
-            "vaddx.y	%[temp1], %[temp1], %[temp1] \n"
-            "vaddx.z	%[temp2], %[temp2], %[temp2] \n"
-            "vaddx.w	%[temp3], %[temp3], %[temp3] \n"
-            "vaddz.x	%[temp0], %[temp0], %[temp0] \n"
-            "vaddz.y	%[temp1], %[temp1], %[temp1] \n"
-            "vaddy.z	%[temp2], %[temp2], %[temp2] \n"
-            "vaddy.w	%[temp3], %[temp3], %[temp3] \n"
-            "vaddw.x	%[result], %[temp0], %[temp0] \n"
-            "vaddw.y	%[result], %[temp1], %[temp1] \n"
-            "vaddw.z	%[result], %[temp2], %[temp2] \n"
-            "vaddz.w	%[result], %[temp3], %[temp3] \n"
-            : [result] "=&j"(result),
-            [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [temp3] "=&j"(temp3)
-            : [col0] "j"(col0),
-            [col1] "j"(col1),
-            [col2] "j"(col2),
-            [col3] "j"(col3),
-            [vec] "j"(vec));
-
+        vec128_t result;
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        vec128_t vec_val = vec.vec128;
+        asm __volatile__(
+            "### mat_44 trans_mult vec_4 ### \n"
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "qmtc2      %[vec_val], $vf5         \n"
+            "vmul       $vf6, $vf1, $vf5         \n"
+            "vmul       $vf7, $vf2, $vf5         \n"
+            "vmul       $vf8, $vf3, $vf5         \n"
+            "vmul       $vf9, $vf4, $vf5         \n"
+            "vaddy.x    $vf6, $vf6, $vf6         \n"
+            "vaddx.y    $vf7, $vf7, $vf7         \n"
+            "vaddx.z    $vf8, $vf8, $vf8         \n"
+            "vaddx.w    $vf9, $vf9, $vf9         \n"
+            "vaddz.x    $vf6, $vf6, $vf6         \n"
+            "vaddz.y    $vf7, $vf7, $vf7         \n"
+            "vaddy.z    $vf8, $vf8, $vf8         \n"
+            "vaddy.w    $vf9, $vf9, $vf9         \n"
+            "vaddw.x    $vf10, $vf6, $vf6        \n"
+            "vaddw.y    $vf10, $vf7, $vf7        \n"
+            "vaddw.z    $vf10, $vf8, $vf8        \n"
+            "vaddz.w    $vf10, $vf9, $vf9        \n"
+            "qmfc2      %[result], $vf10         \n"
+            : [result] "=&r"(result)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val),
+              [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val)
+            : "memory");
         return vec_4(result);
     }
 
     vec_4
     trans_mult(vector_t vec) const
     {
-        vec128_t result, temp0, temp1, temp2, temp3;
-
-        asm("### mat_44 trans_mult vector_t ### \n"
-            "vmul	%[temp3], %[col3], %[vec] \n"
-            "vmul	%[temp0], %[col0], %[vec] \n"
-            "vmul	%[temp1], %[col1], %[vec] \n"
-            "vmul	%[temp2], %[col2], %[vec] \n"
-            "vmulx.w	%[temp3], vf00, %[temp3] \n"
-            "vaddy.x	%[temp0], %[temp0], %[temp0] \n"
-            "vaddx.y	%[temp1], %[temp1], %[temp1] \n"
-            "vaddx.z	%[temp2], %[temp2], %[temp2] \n"
-            "vaddy.w	%[temp3], %[temp3], %[temp3] \n"
-            "vaddz.x	%[result], %[temp0], %[temp0] \n"
-            "vaddz.y	%[result], %[temp1], %[temp1] \n"
-            "vaddy.z	%[result], %[temp2], %[temp2] \n"
-            "vaddz.w	%[result], %[temp3], %[temp3] \n"
-            : [result] "=&j"(result),
-            [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [temp3] "=&j"(temp3)
-            : [col0] "j"(col0),
-            [col1] "j"(col1),
-            [col2] "j"(col2),
-            [col3] "j"(col3),
-            [vec] "j"(vec));
-
+        vec128_t result;
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        vec128_t vec_val = vec.vec128;
+        asm __volatile__(
+            "### mat_44 trans_mult vector_t ### \n"
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "qmtc2      %[vec_val], $vf5         \n"
+            "vmul       $vf9, $vf4, $vf5         \n"
+            "vmul       $vf6, $vf1, $vf5         \n"
+            "vmul       $vf7, $vf2, $vf5         \n"
+            "vmul       $vf8, $vf3, $vf5         \n"
+            "vmulx.w    $vf9, $vf0, $vf9         \n"
+            "vaddy.x    $vf6, $vf6, $vf6         \n"
+            "vaddx.y    $vf7, $vf7, $vf7         \n"
+            "vaddx.z    $vf8, $vf8, $vf8         \n"
+            "vaddy.w    $vf9, $vf9, $vf9         \n"
+            "vaddz.x    $vf10, $vf6, $vf6        \n"
+            "vaddz.y    $vf10, $vf7, $vf7        \n"
+            "vaddy.z    $vf10, $vf8, $vf8        \n"
+            "vaddz.w    $vf10, $vf9, $vf9        \n"
+            "qmfc2      %[result], $vf10         \n"
+            : [result] "=&r"(result)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val),
+              [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val)
+            : "memory");
         return vec_4(result);
     }
 
     vec_4
     trans_mult(point_t vec) const
     {
-        vec128_t result, temp0, temp1, temp2, temp3;
-
-        asm("### mat_44 trans_mult point_t ### \n"
-            "vmul	%[temp0], %[col0], %[vec] \n"
-            "vmul	%[temp1], %[col1], %[vec] \n"
-            "vmul	%[temp2], %[col2], %[vec] \n"
-            "vmul	%[temp3], %[col3], %[vec] \n"
-            "vaddy.x	%[temp0], %[temp0], %[temp0] \n"
-            "vaddx.y	%[temp1], %[temp1], %[temp1] \n"
-            "vaddx.z	%[temp2], %[temp2], %[temp2] \n"
-            "vaddx.w	%[temp3], %[col3], %[temp3] \n"
-            "vaddz.x	%[temp0], %[temp0], %[temp0] \n"
-            "vaddz.y	%[temp1], %[temp1], %[temp1] \n"
-            "vaddy.z	%[temp2], %[temp2], %[temp2] \n"
-            "vaddy.w	%[temp3], %[temp3], %[temp3] \n"
-            "vaddw.x	%[result], %[temp0], %[col0] \n"
-            "vaddw.y	%[result], %[temp1], %[col1] \n"
-            "vaddw.z	%[result], %[temp2], %[col2] \n"
-            "vaddz.w	%[result], %[temp3], %[temp3] \n"
-            : [result] "=&j"(result),
-            [temp0] "=&j"(temp0), [temp1] "=&j"(temp1), [temp2] "=&j"(temp2), [temp3] "=&j"(temp3)
-            : [col0] "j"(col0),
-            [col1] "j"(col1),
-            [col2] "j"(col2),
-            [col3] "j"(col3),
-            [vec] "j"(vec));
+        vec128_t result;
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        vec128_t vec_val = vec.vec128;
+        asm __volatile__(
+            "### mat_44 trans_mult point_t ### \n"
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "qmtc2      %[vec_val], $vf5         \n"
+            "vmul       $vf6, $vf1, $vf5         \n"
+            "vmul       $vf7, $vf2, $vf5         \n"
+            "vmul       $vf8, $vf3, $vf5         \n"
+            "vmul       $vf9, $vf4, $vf5         \n"
+            "vaddy.x    $vf6, $vf6, $vf6         \n"
+            "vaddx.y    $vf7, $vf7, $vf7         \n"
+            "vaddx.z    $vf8, $vf8, $vf8         \n"
+            "vaddx.w    $vf9, $vf4, $vf9         \n"
+            "vaddz.x    $vf6, $vf6, $vf6         \n"
+            "vaddz.y    $vf7, $vf7, $vf7         \n"
+            "vaddy.z    $vf8, $vf8, $vf8         \n"
+            "vaddy.w    $vf9, $vf9, $vf9         \n"
+            "vaddw.x    $vf10, $vf6, $vf1        \n"
+            "vaddw.y    $vf10, $vf7, $vf2        \n"
+            "vaddw.z    $vf10, $vf8, $vf3        \n"
+            "vaddz.w    $vf10, $vf9, $vf9        \n"
+            "qmfc2      %[result], $vf10         \n"
+            : [result] "=&r"(result)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val),
+              [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val)
+            : "memory");
         return vec_4(result);
     }
 
@@ -1255,49 +1370,58 @@ class transform_t {
 
     vec_4 get_row0() const
     {
+        vec128_t c0 = col0.vec128, c1 = col1.vec128, c2 = col2.vec128, c3 = col3.vec128;
         vec128_t row;
-        asm(" ### transform_t::get_row0 ### \n"
-            "vaddx.x	%[row], vf00, %[col0] \n"
-            "vaddx.y	%[row], vf00, %[col1] \n"
-            "vaddx.z	%[row], vf00, %[col2] \n"
-            "vmulx.w	%[row], vf00, %[col3] \n"
-            : [row] "=&j"(row)
-            : [col0] "j"(col0),
-            [col1] "j"(col1),
-            [col2] "j"(col2),
-            [col3] "j"(col3));
+        asm __volatile__ (" ### transform_t::get_row0 ### \n"
+            "qmtc2 %[c0], $vf1  \n"
+            "qmtc2 %[c1], $vf2  \n"
+            "qmtc2 %[c2], $vf3  \n"
+            "qmtc2 %[c3], $vf4  \n"
+            "vaddx.x $vf5, $vf0, $vf1 \n"
+            "vaddx.y $vf5, $vf0, $vf2 \n"
+            "vaddx.z $vf5, $vf0, $vf3 \n"
+            "vmulx.w $vf5, $vf0, $vf4 \n"
+            "qmfc2 %[row], $vf5  \n"
+            : [row] "=r"(row)
+            : [c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2), [c3] "r"(c3));
         return vec_4(row);
     }
 
     vec_4 get_row1() const
     {
+        vec128_t c0 = col0.vec128, c1 = col1.vec128, c2 = col2.vec128, c3 = col3.vec128;
         vec128_t row;
-        asm(" ### transform_t::get_row1 ### \n"
-            "vaddy.x %[row], vf00, %[col0] \n"
-            "vaddy.y %[row], vf00, %[col1] \n"
-            "vaddy.z %[row], vf00, %[col2] \n"
-            "vmuly.w %[row], vf00, %[col3] \n"
-            : [row] "=&j"(row)
-            : [col0] "j"(col0),
-            [col1] "j"(col1),
-            [col2] "j"(col2),
-            [col3] "j"(col3));
+        asm __volatile__ (" ### transform_t::get_row1 ### \n"
+            "qmtc2 %[c0], $vf1  \n"
+            "qmtc2 %[c1], $vf2  \n"
+            "qmtc2 %[c2], $vf3  \n"
+            "qmtc2 %[c3], $vf4  \n"
+            "vaddy.x $vf5, $vf0, $vf1 \n"
+            "vaddy.y $vf5, $vf0, $vf2 \n"
+            "vaddy.z $vf5, $vf0, $vf3 \n"
+            "vmuly.w $vf5, $vf0, $vf4 \n"
+            "qmfc2 %[row], $vf5  \n"
+            : [row] "=r"(row)
+            : [c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2), [c3] "r"(c3));
         return vec_4(row);
     }
 
     vec_4 get_row2() const
     {
         vec128_t row;
-        asm(" ### transform_t::get_row2 ### \n"
-            "vaddz.x %[row], vf00, %[col0] \n"
-            "vaddz.y %[row], vf00, %[col1] \n"
-            "vaddz.z %[row], vf00, %[col2] \n"
-            "vmulz.w %[row], vf00, %[col3] \n"
-            : [row] "=&j"(row)
-            : [col0] "j"(col0),
-            [col1] "j"(col1),
-            [col2] "j"(col2),
-            [col3] "j"(col3));
+        asm __volatile__ (" ### transform_t::get_row2 ### \n"
+            "lqc2 $vf1, 0(%[T])   \n"
+            "lqc2 $vf2, 16(%[T])  \n"
+            "lqc2 $vf3, 32(%[T])  \n"
+            "lqc2 $vf4, 48(%[T])  \n"
+            "vaddz.x $vf5, $vf0, $vf1 \n"
+            "vaddz.y $vf5, $vf0, $vf2 \n"
+            "vaddz.z $vf5, $vf0, $vf3 \n"
+            "vmulz.w $vf5, $vf0, $vf4 \n"
+            "qmfc2 %[row], $vf5  \n"
+            : [row] "=r"(row)
+            : [T] "r"(this)
+            : "memory");
         return vec_4(row);
     }
 
@@ -1327,26 +1451,31 @@ class transform_t {
     void
     orthonormal_inverse_in_place()
     {
-        vec128_t temp;
-        asm(
+        asm __volatile__ (
             "### transform_t::orthonormal_inverse_in_place ### \n"
-            "vadd.xz	%[temp], vf00, %[col1] \n"
-            "vaddx.y	%[temp], vf00, %[col2] \n"
-            "vaddy.x	%[col1], vf00, %[col0] \n"
-            "vaddy.z	%[col1], vf00, %[col2] \n"
-            "vaddz.x	%[col2], vf00, %[col0] \n"
-            "vaddy.z	%[col0], vf00, %[temp] \n"
-            "vaddx.y	%[col0], vf00, %[temp] \n"
-            "vaddz.y	%[col2], vf00, %[temp] \n"
-            "vadda	ACC, vf00, vf00 \n"
-            "vmsubay	ACC, %[col1], %[col3] \n"
-            "vmsubax	ACC, %[col0], %[col3] \n"
-            "vmsubz	%[col3], %[col2], %[col3] \n"
-            : [col0] "+j"(col0),
-            [col1] "+j"(col1),
-            [col2] "+j"(col2),
-            [col3] "+j"(col3),
-            [temp] "=j"(temp), "=r"(vu0_ACC));
+            "lqc2 $vf1, 0(%[T])    \n"
+            "lqc2 $vf2, 16(%[T])   \n"
+            "lqc2 $vf3, 32(%[T])   \n"
+            "lqc2 $vf4, 48(%[T])   \n"
+            "vadd.xz   $vf5, $vf0, $vf2  \n"
+            "vaddx.y   $vf5, $vf0, $vf3  \n"
+            "vaddy.x   $vf2, $vf0, $vf1  \n"
+            "vaddy.z   $vf2, $vf0, $vf3  \n"
+            "vaddz.x   $vf3, $vf0, $vf1  \n"
+            "vaddy.z   $vf1, $vf0, $vf5  \n"
+            "vaddx.y   $vf1, $vf0, $vf5  \n"
+            "vaddz.y   $vf3, $vf0, $vf5  \n"
+            "vsuba     $ACC, $vf0, $vf0  \n"
+            "vmsubay   $ACC, $vf2, $vf4  \n"
+            "vmsubax   $ACC, $vf1, $vf4  \n"
+            "vmsubz    $vf4, $vf3, $vf4  \n"
+            "sqc2 $vf1, 0(%[T])    \n"
+            "sqc2 $vf2, 16(%[T])   \n"
+            "sqc2 $vf3, 32(%[T])   \n"
+            "sqc2 $vf4, 48(%[T])   \n"
+            : "=r"(vu0_ACC)
+            : [T] "r"(this)
+            : "memory");
     }
 
     // matrix/vector operations
@@ -1354,50 +1483,70 @@ class transform_t {
     vec_4
     operator*(vec_4 vec) const
     {
+        vec128_t v = vec.vec128;
         vec128_t result;
-        asm(
+        asm __volatile__ (
             " ### transform_t * vec_4 ### \n"
-            "vmulax	ACC, %[col0], %[vec]		\n"
-            "vmadday	ACC, %[col1], %[vec]		\n"
-            "vmaddaz	ACC, %[col2], %[vec]		\n"
-            "vmaddw.xyz	%[result], %[col3], %[vec]	\n"
-            "vmove.w	%[result], %[vec]		\n"
-            : [result] "=&j"(result), "=r"(vu0_ACC)
-            : [vec] "j"(vec),
-            [col0] "j"(col0), [col1] "j"(col1),
-            [col2] "j"(col2), [col3] "j"(col3));
+            "lqc2 $vf1, 0(%[T])    \n"
+            "lqc2 $vf2, 16(%[T])   \n"
+            "lqc2 $vf3, 32(%[T])   \n"
+            "lqc2 $vf4, 48(%[T])   \n"
+            "qmtc2 %[v],  $vf5     \n"
+            "vsuba     $ACC, $vf0, $vf0  \n"
+            "vmulax    $ACC, $vf1, $vf5  \n"
+            "vmadday   $ACC, $vf2, $vf5  \n"
+            "vmaddaz   $ACC, $vf3, $vf5  \n"
+            "vmaddw    $vf6, $vf4, $vf5  \n"
+            "qmfc2 %[result], $vf6       \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [T] "r"(this), [v] "r"(v)
+            : "memory");
         return vec_4(result);
     }
 
     vector_t
     operator*(vector_t vec) const
     {
+        vec128_t v = vec.vec128;
         vec128_t result;
-        asm(
+        asm __volatile__ (
             " ### transform_t * vector_t ### \n"
-            "vmulax	ACC, %[col0], %[vec]		\n"
-            "vmadday	ACC, %[col1], %[vec]		\n"
-            "vmaddz	%[result], %[col2, %[vec]	\n"
-            : [result] "=&j"(result), "=r"(vu0_ACC)
-            : [vec] "j"(vec),
-            [col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2));
+            "lqc2 $vf1, 0(%[T])    \n"
+            "lqc2 $vf2, 16(%[T])   \n"
+            "lqc2 $vf3, 32(%[T])   \n"
+            "qmtc2 %[v],  $vf4     \n"
+            "vsuba     $ACC, $vf0, $vf0  \n"
+            "vmulax    $ACC, $vf1, $vf4  \n"
+            "vmadday   $ACC, $vf2, $vf4  \n"
+            "vmaddz    $vf5, $vf3, $vf4  \n"
+            "qmfc2 %[result], $vf5       \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [T] "r"(this), [v] "r"(v)
+            : "memory");
         return vector_t(result);
     }
 
     point_t
     operator*(point_t pt) const
     {
+        vec128_t p = pt.vec128;
         vec128_t result;
-        asm(
+        asm __volatile__ (
             " ### transform_t * point_t ### \n"
-            "vmulax	ACC, %[col0], %[pt]		\n"
-            "vmadday	ACC, %[col1], %[pt]		\n"
-            "vmaddaz	ACC, %[col2], %[pt]		\n"
-            "vmaddw	%[result], %[col3], vf00	\n"
-            : [result] "=&j"(result), "=r"(vu0_ACC)
-            : [pt] "j"(pt),
-            [col0] "j"(col0), [col1] "j"(col1),
-            [col2] "j"(col2), [col3] "j"(col3));
+            "lqc2 $vf1, 0(%[T])    \n"
+            "lqc2 $vf2, 16(%[T])   \n"
+            "lqc2 $vf3, 32(%[T])   \n"
+            "lqc2 $vf4, 48(%[T])   \n"
+            "qmtc2 %[p],  $vf5     \n"
+            "vsuba     $ACC, $vf0, $vf0  \n"
+            "vmulax    $ACC, $vf1, $vf5  \n"
+            "vmadday   $ACC, $vf2, $vf5  \n"
+            "vmaddaz   $ACC, $vf3, $vf5  \n"
+            "vmaddw    $vf6, $vf4, $vf0  \n"
+            "qmfc2 %[result], $vf6       \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [T] "r"(this), [p] "r"(p)
+            : "memory");
         return point_t(result);
     }
 
@@ -1440,22 +1589,25 @@ inline mat_33
     vec_xyz::operator~() const
 {
     mat_33 result;
-    asm("### make tilde matrix from vec_xyz ### \n"
-        "vsub		%[res0], %[res0], %[res0] \n"
-        "vsub		%[res1], %[res1], %[res1] \n"
-        "vmr32		%[res2], vf00 \n"
-        "vaddw.x	%[res0], vf00, vf00 \n"
-        "vaddw.y	%[res1], vf00, vf00 \n"
-        "vopmula	ACC, %[vec], %[res0] \n"
-        "vopmsub	%[res0], %[res0], %[vec] \n"
-        "vopmula	ACC, %[vec], %[res1] \n"
-        "vopmsub	%[res1], %[res1], %[vec] \n"
-        "vopmula	ACC, %[vec], %[res2] \n"
-        "vopmsub	%[res2], %[res2], %[vec] \n"
-        : [res0] "=&j"(result.col0),
-        [res1] "=&j"(result.col1),
-        [res2] "=&j"(result.col2), "=r"(vu0_ACC)
-        : [vec] "j"(vec128));
+    vec128_t v = vec128;
+    asm __volatile__ ("### make tilde matrix from vec_xyz ### \n"
+        "qmtc2 %[v], $vf4              \n"
+        "vsub     $vf1, $vf0, $vf0     \n"
+        "vsub     $vf2, $vf0, $vf0     \n"
+        "vmr32    $vf3, $vf0           \n"
+        "vaddw.x  $vf1, $vf0, $vf0     \n"
+        "vaddw.y  $vf2, $vf0, $vf0     \n"
+        "vopmula  $ACC, $vf4, $vf1     \n"
+        "vopmsub  $vf1, $vf1, $vf4     \n"
+        "vopmula  $ACC, $vf4, $vf2     \n"
+        "vopmsub  $vf2, $vf2, $vf4     \n"
+        "vopmula  $ACC, $vf4, $vf3     \n"
+        "vopmsub  $vf3, $vf3, $vf4     \n"
+        "qmfc2 %[r0], $vf1             \n"
+        "qmfc2 %[r1], $vf2             \n"
+        "qmfc2 %[r2], $vf3             \n"
+        : [r0] "=&r"(result.col0), [r1] "=&r"(result.col1), [r2] "=&r"(result.col2), "=r"(vu0_ACC)
+        : [v] "r"(v));
     return result;
 }
 
@@ -1483,68 +1635,70 @@ vec_xyz::tilde_mult(const mat_34& mat) const
 inline vec_xyz
     vec_xyz::operator*(const mat_33& mat) const
 {
-    vec128_t ones, temp0, temp1, temp2, result;
-
-    asm("### vec_xyz (row) * mat_33 ## \n"
-        "vmul		%[temp0], %[vec], %[col0] \n"
-        "vmaxw		%[ones], vf00, vf00 \n"
-        "vmul		%[temp1], %[vec], %[col1] \n"
-        "vmul		%[temp2], %[vec], %[col2] \n"
-        "vadday.x	ACC, %[temp0], %[temp0] \n"
-        "vmaddz.x	%[res], %[ones], %[temp0] \n"
-        "vaddax.y	ACC, %[temp1], %[temp1] \n"
-        "vmaddz.y	%[res], %[ones], %[temp1] \n"
-        "vaddax.z	ACC, %[temp2], %[temp2] \n"
-        "vmaddy.z	%[res], %[ones], %[temp2] \n"
-        : [res] "=&j"(result), [ones] "=&j"(ones),
-        [temp0] "=&j"(temp0),
-        [temp1] "=&j"(temp1),
-        [temp2] "=&j"(temp2), "=r"(vu0_ACC)
-        : [col0] "j"(mat.col0),
-        [col1] "j"(mat.col1),
-        [col2] "j"(mat.col2),
-        [vec] "j"(vec128));
+    vec128_t result;
+    vec128_t col0_val = mat.col0.vec128;
+    vec128_t col1_val = mat.col1.vec128;
+    vec128_t col2_val = mat.col2.vec128;
+    vec128_t vec_val = vec128;
+    asm __volatile__(
+        "### vec_xyz (row) * mat_33 ### \n"
+        "qmtc2      %[col0_val], $vf1        \n"
+        "qmtc2      %[col1_val], $vf2        \n"
+        "qmtc2      %[col2_val], $vf3        \n"
+        "qmtc2      %[vec_val], $vf4         \n"
+        "vmaxw      $vf5, $vf0, $vf0         \n"
+        "vmul       $vf6, $vf4, $vf1         \n"
+        "vmul       $vf7, $vf4, $vf2         \n"
+        "vmul       $vf8, $vf4, $vf3         \n"
+        "vadday.x   $ACC, $vf6, $vf6         \n"
+        "vmaddz.x   $vf9, $vf5, $vf6         \n"
+        "vaddax.y   $ACC, $vf7, $vf7         \n"
+        "vmaddz.y   $vf9, $vf5, $vf7         \n"
+        "vaddax.z   $ACC, $vf8, $vf8         \n"
+        "vmaddy.z   $vf9, $vf5, $vf8         \n"
+        "qmfc2      %[result], $vf9          \n"
+        : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+        : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val),
+          [col2_val] "r"(col2_val), [vec_val] "r"(vec_val)
+        : "memory");
     return vec_xyz(result);
 }
 
 inline vec_xyzw
     vec_xyz::operator*(const mat_34& mat) const
 {
-    vec128_t ones, temp0, temp1, temp2, temp3, result;
-
-    asm("### vec_xyz (row) * mat_34 ## \n"
-        "vmul	%[temp0], %[vec], %[col0] \n"
-        "vmaxw	%[ones], vf00, vf00 \n"
-        "vmul	%[temp1], %[vec], %[col1] \n"
-        "vmul	%[temp2], %[vec], %[col2] \n"
-        "vmul	%[temp3], %[vec], %[col3] \n"
-        : [ones] "=&j"(ones),
-        [temp0] "=&j"(temp0),
-        [temp1] "=&j"(temp1),
-        [temp2] "=&j"(temp2),
-        [temp3] "=&j"(temp3)
-        : [col0] "j"(mat.col0),
-        [col1] "j"(mat.col1),
-        [col2] "j"(mat.col2),
-        [col3] "j"(mat.col3),
-        [vec] "j"(vec128));
-
-    asm("vmulx.w	%[temp3], vf00, %[temp3] \n"
-        "vadday.x	ACC, %[temp0], %[temp0] \n"
-        "vmaddz.x	%[res], %[ones], %[temp0] \n"
-        "vaddax.y	ACC, %[temp1], %[temp1] \n"
-        "vmaddz.y	%[res], %[ones], %[temp1] \n"
-        "vaddax.z	ACC, %[temp2], %[temp2] \n"
-        "vmaddy.z	%[res], %[ones], %[temp2] \n"
-        "vadday.w	ACC, %[temp3], %[temp3] \n"
-        "vmaddz.w	%[res], %[ones], %[temp3] \n"
-        : [res] "=&j"(result), "=r"(vu0_ACC)
-        : [ones] "j"(ones),
-        [temp0] "j"(temp0),
-        [temp1] "j"(temp1),
-        [temp2] "j"(temp2),
-        [temp3] "j"(temp3));
-
+    vec128_t result;
+    vec128_t col0_val = mat.col0.vec128;
+    vec128_t col1_val = mat.col1.vec128;
+    vec128_t col2_val = mat.col2.vec128;
+    vec128_t col3_val = mat.col3.vec128;
+    vec128_t vec_val = vec128;
+    asm __volatile__(
+        "### vec_xyz (row) * mat_34 ### \n"
+        "qmtc2      %[col0_val], $vf1        \n"
+        "qmtc2      %[col1_val], $vf2        \n"
+        "qmtc2      %[col2_val], $vf3        \n"
+        "qmtc2      %[col3_val], $vf4        \n"
+        "qmtc2      %[vec_val], $vf5         \n"
+        "vmaxw      $vf6, $vf0, $vf0         \n"
+        "vmul       $vf7, $vf5, $vf1         \n"
+        "vmul       $vf8, $vf5, $vf2         \n"
+        "vmul       $vf9, $vf5, $vf3         \n"
+        "vmul       $vf10, $vf5, $vf4        \n"
+        "vmulx.w    $vf10, $vf0, $vf10       \n"
+        "vadday.x   $ACC, $vf7, $vf7         \n"
+        "vmaddz.x   $vf11, $vf6, $vf7        \n"
+        "vaddax.y   $ACC, $vf8, $vf8         \n"
+        "vmaddz.y   $vf11, $vf6, $vf8        \n"
+        "vaddax.z   $ACC, $vf9, $vf9         \n"
+        "vmaddy.z   $vf11, $vf6, $vf9        \n"
+        "vadday.w   $ACC, $vf10, $vf10       \n"
+        "vmaddz.w   $vf11, $vf6, $vf10       \n"
+        "qmfc2      %[result], $vf11         \n"
+        : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+        : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val),
+          [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val)
+        : "memory");
     return vec_xyzw(result);
 }
 
@@ -1573,75 +1727,76 @@ vec_xyz::tensor_mult(const vec_4 vec) const
 inline vec_xyz
     vec_xyzw::operator*(const mat_43& mat) const
 {
-    vec128_t ones, temp0, temp1, temp2, result;
-
-    asm("### vec_xyzw (row) * mat_43 ## \n"
-        "vmul		%[temp0], %[vec], %[col0] \n"
-        "vmaxw		%[ones], vf00, vf00 \n"
-        "vmul		%[temp1], %[vec], %[col1] \n"
-        "vmul		%[temp2], %[vec], %[col2] \n"
-        "vadday.x	ACC, %[temp0], %[temp0] \n"
-        "vmaddaz.x	ACC, %[ones], %[temp0] \n"
-        "vmaddw.x	%[res], %[ones], %[temp0] \n"
-        "vaddax.y	ACC, %[temp1], %[temp1] \n"
-        "vmaddaz.y	ACC, %[ones], %[temp1] \n"
-        "vmaddw.y	%[res], %[ones], %[temp1] \n"
-        "vaddax.z	ACC, %[temp2], %[temp2] \n"
-        "vmadday.z	ACC, %[ones], %[temp2] \n"
-        "vmaddw.z	%[res], %[ones], %[temp2] \n"
-        : [res] "=&j"(result), [ones] "=&j"(ones),
-        [temp0] "=&j"(temp0),
-        [temp1] "=&j"(temp1),
-        [temp2] "=&j"(temp2), "=r"(vu0_ACC)
-        : [col0] "j"(mat.col0),
-        [col1] "j"(mat.col1),
-        [col2] "j"(mat.col2),
-        [vec] "j"(vec128));
-
+    vec128_t result;
+    vec128_t col0_val = mat.col0.vec128;
+    vec128_t col1_val = mat.col1.vec128;
+    vec128_t col2_val = mat.col2.vec128;
+    vec128_t vec_val = vec128;
+    asm __volatile__(
+        "### vec_xyzw (row) * mat_43 ### \n"
+        "qmtc2      %[col0_val], $vf1        \n"
+        "qmtc2      %[col1_val], $vf2        \n"
+        "qmtc2      %[col2_val], $vf3        \n"
+        "qmtc2      %[vec_val], $vf4         \n"
+        "vmaxw      $vf5, $vf0, $vf0         \n"
+        "vmul       $vf6, $vf4, $vf1         \n"
+        "vmul       $vf7, $vf4, $vf2         \n"
+        "vmul       $vf8, $vf4, $vf3         \n"
+        "vadday.x   $ACC, $vf6, $vf6         \n"
+        "vmaddaz.x  $ACC, $vf5, $vf6         \n"
+        "vmaddw.x   $vf9, $vf5, $vf6         \n"
+        "vaddax.y   $ACC, $vf7, $vf7         \n"
+        "vmaddaz.y  $ACC, $vf5, $vf7         \n"
+        "vmaddw.y   $vf9, $vf5, $vf7         \n"
+        "vaddax.z   $ACC, $vf8, $vf8         \n"
+        "vmadday.z  $ACC, $vf5, $vf8         \n"
+        "vmaddw.z   $vf9, $vf5, $vf8         \n"
+        "qmfc2      %[result], $vf9          \n"
+        : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+        : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val),
+          [col2_val] "r"(col2_val), [vec_val] "r"(vec_val)
+        : "memory");
     return vec_xyz(result);
 }
 
 inline vec_xyzw
     vec_xyzw::operator*(const mat_44& mat) const
 {
-    vec128_t ones, temp0, temp1, temp2, temp3, result;
-
-    asm("### vec_xyzw (row) * mat_44 ## \n"
-        "vmaxw	%[ones], vf00, vf00 \n"
-        "vmul	%[temp0], %[vec], %[col0] \n"
-        "vmul	%[temp1], %[vec], %[col1] \n"
-        "vmul	%[temp2], %[vec], %[col2] \n"
-        "vmul	%[temp3], %[vec], %[col3] \n"
-        : [ones] "=&j"(ones),
-        [temp0] "=&j"(temp0),
-        [temp1] "=&j"(temp1),
-        [temp2] "=&j"(temp2),
-        [temp3] "=&j"(temp3)
-        : [col0] "j"(mat.col0),
-        [col1] "j"(mat.col1),
-        [col2] "j"(mat.col2),
-        [col3] "j"(mat.col3),
-        [vec] "j"(vec128));
-
-    asm("vadday.x	ACC, %[temp0], %[temp0] \n"
-        "vmaddaz.x	ACC, %[ones], %[temp0] \n"
-        "vmaddw.x	%[res], %[ones], %[temp0] \n"
-        "vaddax.y	ACC, %[temp1], %[temp1] \n"
-        "vmaddaz.y	ACC, %[ones], %[temp1] \n"
-        "vmaddw.y	%[res], %[ones], %[temp1] \n"
-        "vaddax.z	ACC, %[temp2], %[temp2] \n"
-        "vmadday.z	ACC, %[ones], %[temp2] \n"
-        "vmaddw.z	%[res], %[ones], %[temp2] \n"
-        "vaddax.w	ACC, %[temp3], %[temp3] \n"
-        "vmadday.w	ACC, %[ones], %[temp3] \n"
-        "vmaddz.w	%[res], %[ones], %[temp3] \n"
-        : [res] "=&j"(result), "=r"(vu0_ACC)
-        : [ones] "j"(ones),
-        [temp0] "j"(temp0),
-        [temp1] "j"(temp1),
-        [temp2] "j"(temp2),
-        [temp3] "j"(temp3));
-
+    vec128_t result;
+    vec128_t col0_val = mat.col0.vec128;
+    vec128_t col1_val = mat.col1.vec128;
+    vec128_t col2_val = mat.col2.vec128;
+    vec128_t col3_val = mat.col3.vec128;
+    vec128_t vec_val = vec128;
+    asm __volatile__(
+        "### vec_xyzw (row) * mat_44 ### \n"
+        "qmtc2      %[col0_val], $vf1        \n"
+        "qmtc2      %[col1_val], $vf2        \n"
+        "qmtc2      %[col2_val], $vf3        \n"
+        "qmtc2      %[col3_val], $vf4        \n"
+        "qmtc2      %[vec_val], $vf5         \n"
+        "vmaxw      $vf6, $vf0, $vf0         \n"
+        "vmul       $vf7, $vf5, $vf1         \n"
+        "vmul       $vf8, $vf5, $vf2         \n"
+        "vmul       $vf9, $vf5, $vf3         \n"
+        "vmul       $vf10, $vf5, $vf4        \n"
+        "vadday.x   $ACC, $vf7, $vf7         \n"
+        "vmaddaz.x  $ACC, $vf6, $vf7         \n"
+        "vmaddw.x   $vf11, $vf6, $vf7        \n"
+        "vaddax.y   $ACC, $vf8, $vf8         \n"
+        "vmaddaz.y  $ACC, $vf6, $vf8         \n"
+        "vmaddw.y   $vf11, $vf6, $vf8        \n"
+        "vaddax.z   $ACC, $vf9, $vf9         \n"
+        "vmadday.z  $ACC, $vf6, $vf9         \n"
+        "vmaddw.z   $vf11, $vf6, $vf9        \n"
+        "vaddax.w   $ACC, $vf10, $vf10       \n"
+        "vmadday.w  $ACC, $vf6, $vf10        \n"
+        "vmaddz.w   $vf11, $vf6, $vf10       \n"
+        "qmfc2      %[result], $vf11         \n"
+        : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+        : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val),
+          [col2_val] "r"(col2_val), [col3_val] "r"(col3_val), [vec_val] "r"(vec_val)
+        : "memory");
     return vec_xyzw(result);
 }
 
@@ -1742,43 +1897,40 @@ inline mat_33
 mat_33::inverse() const
 {
     mat_33 result;
-    vec128_t temp, determinant;
-
-    asm("### mat_33::inverse ### \n"
-
-        "vopmula.xyz	ACC, %[col0], %[col1]			# inv2 = crossproduct(col0, col1) \n"
-        "vopmsub.xyz	%[inv2], %[col1], %[col0] \n"
-        "vopmula.xyz	ACC, %[col1], %[col2]			# inv0 = crossproduct(col1, col2) \n"
-        "vopmsub.xyz	%[inv0], %[col2], %[col1] \n"
-        "							# stall \n"
-        "vmul		%[determinant], %[col2], %[inv2]	# determinant(R) = dotproduct(col2, inv2) \n"
-        "vaddw.x	%[temp], vf00, vf00  \n"
-        "vopmula.xyz	ACC, %[col2], %[col0]			# inv1 = crossproduct(col2, col0) \n"
-        "vopmsub.xyz	%[inv1], %[col0], %[col2]  \n"
-        "vadday.x	ACCx, %[determinant], %[determinant] \n"
-        "vmaddz.x	%[determinant], %[temp], %[determinant] \n"
-
-        "vaddx.y	%[temp], vf00], %[inv2]			# Do an in-place transpose, produces determinant(R)*Rinv \n"
-        "vadd.xz	%[temp], vf00], %[inv1] \n"
-        "vaddy.x	%[inv1], vf00], %[inv0] \n"
-        "vdiv		Q, vf00w, %[determinantx]		# Q = 1/determinant(R) \n"
-        "vaddy.z	%[inv1], vf00, %[inv2] \n"
-        "vaddz.x	%[inv2], vf00, %[inv0] \n"
-        "vaddy.z	%[inv0], vf00, %[temp] \n"
-        "vaddx.y	%[inv0], vf00, %[temp] \n"
-        "vaddz.y	%[inv2], vf00, %[temp] \n"
-        "vwaitq \n"
-        "vmulq.xyz	%[inv1], %[inv1], Q \n"
-        "vmulq.xyz	%[inv0], %[inv0], Q			# Multiply by 1/determinant(R) \n"
-        "vmulq.xyz	%[inv2], %[inv2], Q \n"
-
-        : [inv0] "=&j"(result.col0),
-        [inv1] "=&j"(result.col1),
-        [inv2] "=&j"(result.col2),
-        [temp] "=&j"(temp), [determinant] "=&j"(determinant), "=r"(vu0_ACC)
-        : [col0] "j"(col0),
-        [col1] "j"(col1),
-        [col2] "j"(col2));
+    vec128_t c0 = col0.vec128, c1 = col1.vec128, c2 = col2.vec128;
+
+    asm __volatile__ ("### mat_33::inverse ### \n"
+        "qmtc2 %[c0], $vf1                              \n"
+        "qmtc2 %[c1], $vf2                              \n"
+        "qmtc2 %[c2], $vf3                              \n"
+        "vopmula.xyz $ACC, $vf1, $vf2  # inv2 = col0 x col1 \n"
+        "vopmsub.xyz $vf6, $vf2, $vf1                   \n"
+        "vopmula.xyz $ACC, $vf2, $vf3  # inv0 = col1 x col2 \n"
+        "vopmsub.xyz $vf4, $vf3, $vf2                   \n"
+        "vmul        $vf9, $vf3, $vf6  # det components     \n"
+        "vaddw.x     $vf8, $vf0, $vf0  # temp.x = 1.0       \n"
+        "vopmula.xyz $ACC, $vf3, $vf1  # inv1 = col2 x col0 \n"
+        "vopmsub.xyz $vf5, $vf1, $vf3                   \n"
+        "vadday.x    $ACC, $vf9, $vf9                   \n"
+        "vmaddz.x    $vf9, $vf8, $vf9  # det = det.x+det.y+det.z \n"
+        "vaddx.y     $vf8, $vf0, $vf6  # transpose           \n"
+        "vadd.xz     $vf8, $vf0, $vf5                   \n"
+        "vaddy.x     $vf5, $vf0, $vf4                   \n"
+        "vdiv        $Q, $vf0w, $vf9x   # Q = 1/det          \n"
+        "vaddy.z     $vf5, $vf0, $vf6                   \n"
+        "vaddz.x     $vf6, $vf0, $vf4                   \n"
+        "vaddy.z     $vf4, $vf0, $vf8                   \n"
+        "vaddx.y     $vf4, $vf0, $vf8                   \n"
+        "vaddz.y     $vf6, $vf0, $vf8                   \n"
+        "vwaitq                                         \n"
+        "vmulq.xyz   $vf5, $vf5, $Q                      \n"
+        "vmulq.xyz   $vf4, $vf4, $Q     # scale by 1/det     \n"
+        "vmulq.xyz   $vf6, $vf6, $Q                      \n"
+        "qmfc2 %[r0], $vf4                              \n"
+        "qmfc2 %[r1], $vf5                              \n"
+        "qmfc2 %[r2], $vf6                              \n"
+        : [r0] "=&r"(result.col0), [r1] "=&r"(result.col1), [r2] "=&r"(result.col2), "=r"(vu0_ACC)
+        : [c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2));
     return result;
 }
 
@@ -1842,15 +1994,23 @@ inline mat_33
 mat_33::mult_tilde(vec_3 vec) const
 {
     mat_33 result;
-    asm("### mat_33 mult_tilde vec_3 ### \n"
-        "vmulaz	ACC, %[col1], %[vec] \n"
-        "vmsuby	%[res0], %[col2, %[vec] \n"
-        "vmulax	ACC, %[col2, %[vec] \n"
-        "vmsubz	%[res1], %[col0], %[vec] \n"
-        "vmulay	ACC, %[col0, %[vec] \n"
-        "vmsubx	%[res2], %[col1], %[vec] \n"
-        : [res0] "=&j"(result.col0), [res1] "=&j"(result.col1), [res2] "=&j"(result.col2), "=r"(vu0_ACC)
-        : [col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2), [vec] "j"(vec));
+    vec128_t c0 = col0.vec128, c1 = col1.vec128, c2 = col2.vec128, v = vec.vec128;
+    asm __volatile__ ("### mat_33 mult_tilde vec_3 ### \n"
+        "qmtc2 %[c0], $vf1  \n"
+        "qmtc2 %[c1], $vf2  \n"
+        "qmtc2 %[c2], $vf3  \n"
+        "qmtc2 %[v],  $vf4  \n"
+        "vmulaz $ACC, $vf2, $vf4 \n"
+        "vmsuby $vf5, $vf3, $vf4 \n"
+        "vmulax $ACC, $vf3, $vf4 \n"
+        "vmsubz $vf6, $vf1, $vf4 \n"
+        "vmulay $ACC, $vf1, $vf4 \n"
+        "vmsubx $vf7, $vf2, $vf4 \n"
+        "qmfc2 %[r0], $vf5  \n"
+        "qmfc2 %[r1], $vf6  \n"
+        "qmfc2 %[r2], $vf7  \n"
+        : [r0] "=&r"(result.col0), [r1] "=&r"(result.col1), [r2] "=&r"(result.col2), "=r"(vu0_ACC)
+        : [c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2), [v] "r"(v));
     return result;
 }
 
@@ -2287,15 +2447,23 @@ inline mat_43
 mat_43::mult_tilde(vec_3 vec) const
 {
     mat_43 result;
-    asm("### mat_43 mult_tilde vec_3 ### \n"
-        "vmulaz	ACC, %[col1], %[vec] \n"
-        "vmsuby	%[res0], %[col2], %[vec] \n"
-        "vmulax	ACC, %[col2], %[vec] \n"
-        "vmsubz	%[res1], %[col0], %[vec] \n"
-        "vmulay	ACC, %[col0], %[vec] \n"
-        "vmsubx	%[res2], %[col1], %[vec] \n"
-        : [res0] "=&j"(result.col0), [res1] "=&j"(result.col1), [res2] "=&j"(result.col2), "=r"(vu0_ACC)
-        : [col0] "j"(col0), [col1] "j"(col1), [col2] "j"(col2), [vec] "j"(vec));
+    vec128_t c0 = col0.vec128, c1 = col1.vec128, c2 = col2.vec128, v = vec.vec128;
+    asm __volatile__ ("### mat_43 mult_tilde vec_3 ### \n"
+        "qmtc2 %[c0], $vf1  \n"
+        "qmtc2 %[c1], $vf2  \n"
+        "qmtc2 %[c2], $vf3  \n"
+        "qmtc2 %[v],  $vf4  \n"
+        "vmulaz $ACC, $vf2, $vf4 \n"
+        "vmsuby $vf5, $vf3, $vf4 \n"
+        "vmulax $ACC, $vf3, $vf4 \n"
+        "vmsubz $vf6, $vf1, $vf4 \n"
+        "vmulay $ACC, $vf1, $vf4 \n"
+        "vmsubx $vf7, $vf2, $vf4 \n"
+        "qmfc2 %[r0], $vf5  \n"
+        "qmfc2 %[r1], $vf6  \n"
+        "qmfc2 %[r2], $vf7  \n"
+        : [r0] "=&r"(result.col0), [r1] "=&r"(result.col1), [r2] "=&r"(result.col2), "=r"(vu0_ACC)
+        : [c0] "r"(c0), [c1] "r"(c1), [c2] "r"(c2), [v] "r"(v));
     return result;
 }
 
@@ -2620,56 +2788,47 @@ inline transform_t
 transform_t::inverse() const
 {
     transform_t result;
-    vec128_t temp, determinant;
-
-    asm("### transform_t::inverse, part I ### \n"
-        "vopmula.xyz	ACC, %[col0], %[col1]			# inv2 = crossproduct(col0, col1) \n"
-        "vopmsub.xyz	%[inv2], %[col1], %[col0] \n"
-        "vopmula.xyz	ACC, %[col1], %[col2]			# inv0 = crossproduct(col1, col2) \n"
-        "vopmsub.xyz	%[inv0], %[col2], %[col1] \n"
-        "							# stall \n"
-        "vmul		%[determinant], %[col2], %[inv2]	# determinant(R) = dotproduct(col2, inv2) \n"
-        "vaddw.x	%[temp], vf00, vf00  \n"
-        "vopmula.xyz	ACC, %[col2], %[col0]			# inv1 = crossproduct(col2, col0) \n"
-        "vopmsub.xyz	%[inv1], %[col0], %[col2]  \n"
-        "vadday.x	ACC, %[determinant], %[determinant] \n"
-        "vmaddz.x	%[determinant], %[temp], %[determinant] \n"
-
-        "vaddx.y	%[temp], vf00, %[inv2]			# Do an in-place transpose, produces determinant(R)*Rinv \n"
-        "vadd.xz	%[temp], vf00, %[inv1] \n"
-        "vaddy.x	%[inv1], vf00, %[inv0] \n"
-        "vdiv		Q, vf00w, %[determinant]x		# Q = 1/determinant(R) \n"
-        "vaddy.z	%[inv1], vf00, %[inv2] \n"
-        "vaddz.x	%[inv2], vf00, %[inv0] \n"
-        "vaddy.z	%[inv0], vf00, %[temp] \n"
-        "vaddx.y	%[inv0], vf00, %[temp] \n"
-        "vaddz.y	%[inv2], vf00, %[temp] \n"
-        : [inv0] "=&j"(result.col0),
-        [inv1] "=&j"(result.col1),
-        [inv2] "=&j"(result.col2),
-        [temp] "=&j"(temp),
-        [determinant] "=&j"(determinant), "=r"(vu0_ACC), "=j"(vu0_Q)
-        : [col0] "j"(col0),
-        [col1] "j"(col1),
-        [col2] "j"(col2));
-
-    asm("### transform_t::inverse, part II ### \n"
-
-        "vadda		ACC, vf00, vf00				# Compute determinant(R)*Rinv -T \n"
-        "vmsubay	ACC, %[inv1], %[col3] \n"
-        "vmsubax	ACC, %[inv0], %[col3] \n"
-        "vmsubz	%[inv3], %[inv2], %[col3] \n"
-
-        "vmulq.xyz	%[inv0], %[inv0], Q			# Multiply by 1/determinant(R) \n"
-        "vmulq.xyz	%[inv1], %[inv1], Q \n"
-        "vmulq.xyz	%[inv2], %[inv2], Q \n"
-        "vmulq.xyz	%[inv3], %[inv3], Q \n"
-        : [inv0] "+j"(result.col0),
-        [inv1] "+j"(result.col1),
-        [inv2] "+j"(result.col2),
-        [inv3] "=&j"(result.col3),
-        "=r"(vu0_ACC)
-        : [col3] "j"(col3), "j"(vu0_Q));
+
+    asm __volatile__ ("### transform_t::inverse ### \n"
+        "lqc2 $vf1, 0(%[T])                             \n"
+        "lqc2 $vf2, 16(%[T])                            \n"
+        "lqc2 $vf3, 32(%[T])                            \n"
+        "lqc2 $vf4, 48(%[T])                            \n"
+        "vopmula.xyz $ACC, $vf1, $vf2  # inv2 = col0 x col1 \n"
+        "vopmsub.xyz $vf7, $vf2, $vf1                   \n"
+        "vopmula.xyz $ACC, $vf2, $vf3  # inv0 = col1 x col2 \n"
+        "vopmsub.xyz $vf5, $vf3, $vf2                   \n"
+        "vmul        $vf9, $vf3, $vf7  # det components     \n"
+        "vaddw.x     $vf8, $vf0, $vf0  # temp.x = 1.0       \n"
+        "vopmula.xyz $ACC, $vf3, $vf1  # inv1 = col2 x col0 \n"
+        "vopmsub.xyz $vf6, $vf1, $vf3                   \n"
+        "vadday.x    $ACC, $vf9, $vf9                   \n"
+        "vmaddz.x    $vf9, $vf8, $vf9  # det = det.x+det.y+det.z \n"
+        "vaddx.y     $vf8, $vf0, $vf7  # transpose           \n"
+        "vadd.xz     $vf8, $vf0, $vf6                   \n"
+        "vaddy.x     $vf6, $vf0, $vf5                   \n"
+        "vdiv        $Q, $vf0w, $vf9x   # Q = 1/det          \n"
+        "vaddy.z     $vf6, $vf0, $vf7                   \n"
+        "vaddz.x     $vf7, $vf0, $vf5                   \n"
+        "vaddy.z     $vf5, $vf0, $vf8                   \n"
+        "vaddx.y     $vf5, $vf0, $vf8                   \n"
+        "vaddz.y     $vf7, $vf0, $vf8                   \n"
+        "vsuba       $ACC, $vf0, $vf0  # compute -R^T*t      \n"
+        "vmsubay     $ACC, $vf6, $vf4                   \n"
+        "vmsubax     $ACC, $vf5, $vf4                   \n"
+        "vmsubz      $vf9, $vf7, $vf4                   \n"
+        "vwaitq                                         \n"
+        "vmulq.xyz   $vf5, $vf5, $Q     # scale by 1/det     \n"
+        "vmulq.xyz   $vf6, $vf6, $Q                      \n"
+        "vmulq.xyz   $vf7, $vf7, $Q                      \n"
+        "vmulq.xyz   $vf9, $vf9, $Q                      \n"
+        "sqc2 $vf5, 0(%[R])                              \n"
+        "sqc2 $vf6, 16(%[R])                             \n"
+        "sqc2 $vf7, 32(%[R])                             \n"
+        "sqc2 $vf9, 48(%[R])                             \n"
+        : "=r"(vu0_ACC)
+        : [T] "r"(this), [R] "r"(&result)
+        : "memory");
     return result;
 }
 
@@ -2677,28 +2836,31 @@ inline transform_t
 transform_t::orthonormal_inverse() const
 {
     transform_t result;
-    asm("### transform_t::orthonormal_inverse ### \n"
-        "vadd.x	%[inv0], vf00, %[col0] \n"
-        "vadd.y	%[inv1], vf00, %[col1] \n"
-        "vadd.z	%[inv2], vf00, %[col2] \n"
-        "vaddx.y	%[inv0], vf00, %[col1] \n"
-        "vaddy.x	%[inv1], vf00, %[col0] \n"
-        "vaddz.x	%[inv2], vf00, %[col0] \n"
-        "vaddx.z	%[inv0], vf00, %[col2] \n"
-        "vaddy.z	%[inv1], vf00, %[col2] \n"
-        "vaddz.y	%[inv2], vf00, %[col1] \n"
-        "vadda		ACC, vf00, vf00 \n"
-        "vmsubax	ACC, %[inv0], %[col3] \n"
-        "vmsubay	ACC, %[inv1], %[col3] \n"
-        "vmsubz	%[inv3], %[inv2], %[col3] \n"
-        : [inv0] "=&j"(result.col0),
-        [inv1] "=&j"(result.col1),
-        [inv2] "=&j"(result.col2),
-        [inv3] "=&j"(result.col3), "=r"(vu0_ACC)
-        : [col0] "j"(col0),
-        [col1] "j"(col1),
-        [col2] "j"(col2),
-        [col3] "j"(col3));
+    asm __volatile__ ("### transform_t::orthonormal_inverse ### \n"
+        "lqc2 $vf1, 0(%[T])    \n"
+        "lqc2 $vf2, 16(%[T])   \n"
+        "lqc2 $vf3, 32(%[T])   \n"
+        "lqc2 $vf4, 48(%[T])   \n"
+        "vadd.x    $vf5, $vf0, $vf1  \n"
+        "vadd.y    $vf6, $vf0, $vf2  \n"
+        "vadd.z    $vf7, $vf0, $vf3  \n"
+        "vaddx.y   $vf5, $vf0, $vf2  \n"
+        "vaddy.x   $vf6, $vf0, $vf1  \n"
+        "vaddz.x   $vf7, $vf0, $vf1  \n"
+        "vaddx.z   $vf5, $vf0, $vf3  \n"
+        "vaddy.z   $vf6, $vf0, $vf3  \n"
+        "vaddz.y   $vf7, $vf0, $vf2  \n"
+        "vsuba     $ACC, $vf0, $vf0  \n"
+        "vmsubax   $ACC, $vf5, $vf4  \n"
+        "vmsubay   $ACC, $vf6, $vf4  \n"
+        "vmsubz    $vf8, $vf7, $vf4  \n"
+        "sqc2 $vf5, 0(%[R])   \n"
+        "sqc2 $vf6, 16(%[R])  \n"
+        "sqc2 $vf7, 32(%[R])  \n"
+        "sqc2 $vf8, 48(%[R])  \n"
+        : "=r"(vu0_ACC)
+        : [T] "r"(this), [R] "r"(&result)
+        : "memory");
     return result;
 }
 
diff --git a/include/ps2s/matrix_common.h b/include/ps2s/matrix_common.h
index dcdb487..10432fa 100644
--- a/include/ps2s/matrix_common.h
+++ b/include/ps2s/matrix_common.h
@@ -34,14 +34,16 @@ class mat_x3_template {
 
     void set_zero()
     {
-        asm(
-            " ### mat_x3::set_zero ### \n"
-            "vsub    col0, col0, col0 \n"
-            "vsub    col1, col1, col1 \n"
-            "vsub    col2, col2, col2 \n"
-            : "=j col0"(col0.vec128),
-            "=j col1"(col1.vec128),
-            "=j col2"(col2.vec128));
+        asm __volatile__(
+            "vsub       $vf1, $vf0, $vf0         \n"
+            "vsub       $vf2, $vf0, $vf0         \n"
+            "vsub       $vf3, $vf0, $vf0         \n"
+            "qmfc2      %[col0], $vf1           \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128)
+            :
+            : "memory");
     }
 
     void set_col0(column_type new_col) { col0 = new_col; }
@@ -50,54 +52,78 @@ class mat_x3_template {
 
     void set_row0(vec_3 new_row)
     {
-        asm(
-            " ### mat_x3::set_row0 ### \n"
-            "vaddx.x       col0, vf00, new_row \n"
-            "vaddy.x       col1, vf00, new_row \n"
-            "vaddz.x       col2, vf00, new_row \n"
-            : "+j col0"(col0.vec128),
-            "+j col1"(col1.vec128),
-            "+j col2"(col2.vec128)
-            : "j new_row"(new_row.vec128));
+        vec128_t new_row_val = new_row.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0], $vf1            \n"
+            "qmtc2      %[col1], $vf2            \n"
+            "qmtc2      %[col2], $vf3            \n"
+            "qmtc2      %[new_row_val], $vf4    \n"
+            "vaddx.x    $vf1, $vf0, $vf4         \n"
+            "vaddy.x    $vf2, $vf0, $vf4         \n"
+            "vaddz.x    $vf3, $vf0, $vf4         \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128)
+            : [new_row_val] "r"(new_row_val)
+            : "memory");
     }
 
     void set_row1(vec_3 new_row)
     {
-        asm(
-            " ### mat_x3::set_row1 ### \n"
-            "vaddx.y       col0, vf00, new_row \n"
-            "vaddy.y       col1, vf00, new_row \n"
-            "vaddz.y       col2, vf00, new_row \n"
-            : "+j col0"(col0.vec128),
-            "+j col1"(col1.vec128),
-            "+j col2"(col2.vec128)
-            : "j new_row"(new_row.vec128));
+        vec128_t new_row_val = new_row.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0], $vf1            \n"
+            "qmtc2      %[col1], $vf2            \n"
+            "qmtc2      %[col2], $vf3            \n"
+            "qmtc2      %[new_row_val], $vf4    \n"
+            "vaddx.y    $vf1, $vf0, $vf4         \n"
+            "vaddy.y    $vf2, $vf0, $vf4         \n"
+            "vaddz.y    $vf3, $vf0, $vf4         \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128)
+            : [new_row_val] "r"(new_row_val)
+            : "memory");
     }
 
     void set_row2(vec_3 new_row)
     {
-        asm(
-            " ### mat_x3::set_row2 ### \n"
-            "vaddx.z        col0, vf00, new_row \n"
-            "vaddy.z        col1, vf00, new_row \n"
-            "vaddz.z        col2, vf00, new_row \n"
-            : "+j col0"(col0.vec128),
-            "+j col1"(col1.vec128),
-            "+j col2"(col2.vec128)
-            : "j new_row"(new_row.vec128));
+        vec128_t new_row_val = new_row.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0], $vf1            \n"
+            "qmtc2      %[col1], $vf2            \n"
+            "qmtc2      %[col2], $vf3            \n"
+            "qmtc2      %[new_row_val], $vf4    \n"
+            "vaddx.z    $vf1, $vf0, $vf4         \n"
+            "vaddy.z    $vf2, $vf0, $vf4         \n"
+            "vaddz.z    $vf3, $vf0, $vf4         \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128)
+            : [new_row_val] "r"(new_row_val)
+            : "memory");
     }
 
     void set_row3(vec_3 new_row)
     {
-        asm(
-            " ### mat_x3::set_row3 ### \n"
-            "vmulx.w        col0, vf00, new_row \n"
-            "vmuly.w        col1, vf00, new_row \n"
-            "vmulz.w        col2, vf00, new_row \n"
-            : "+j col0"(col0.vec128),
-            "+j col1"(col1.vec128),
-            "+j col2"(col2.vec128)
-            : "j new_row"(new_row.vec128));
+        vec128_t new_row_val = new_row.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0], $vf1            \n"
+            "qmtc2      %[col1], $vf2            \n"
+            "qmtc2      %[col2], $vf3            \n"
+            "qmtc2      %[new_row_val], $vf4    \n"
+            "vmulx.w    $vf1, $vf0, $vf4         \n"
+            "vmuly.w    $vf2, $vf0, $vf4         \n"
+            "vmulz.w    $vf3, $vf0, $vf4         \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128)
+            : [new_row_val] "r"(new_row_val)
+            : "memory");
     }
 
     column_type get_col0() const { return col0; }
@@ -107,60 +133,84 @@ class mat_x3_template {
     vec_3 get_row0() const
     {
         vec128_t row;
-        asm(
-            " ### mat_x3::get_row0 ### \n"
-            "vaddx.x row_, vf00, col0 \n"
-            "vaddx.y row_, vf00, col1 \n"
-            "vaddx.z row_, vf00, col2 \n"
-            : "=&j row_"(row)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "vsub       $vf4, $vf0, $vf0         \n"
+            "vaddx.x $vf4, $vf0, $vf1        \n"
+            "vaddx.y $vf4, $vf0, $vf2        \n"
+            "vaddx.z $vf4, $vf0, $vf3        \n"
+            "qmfc2      %[row], $vf4             \n"
+            : [row] "=&r"(row)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val)
+            : "memory");
         return vec_3(row);
     }
 
     vec_3 get_row1() const
     {
         vec128_t row;
-        asm(
-            " ### mat_x3::get_row1 ### \n"
-            "vaddy.x row_, vf00, col0 \n"
-            "vaddy.y row_, vf00, col1 \n"
-            "vaddy.z row_, vf00, col2 \n"
-            : "=&j row_"(row)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "vsub       $vf4, $vf0, $vf0         \n"
+            "vaddy.x $vf4, $vf0, $vf1        \n"
+            "vaddy.y $vf4, $vf0, $vf2        \n"
+            "vaddy.z $vf4, $vf0, $vf3        \n"
+            "qmfc2      %[row], $vf4             \n"
+            : [row] "=&r"(row)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val)
+            : "memory");
         return vec_3(row);
     }
 
     vec_3 get_row2() const
     {
         vec128_t row;
-        asm(
-            " ### mat_x3::get_row2 ### \n"
-            "vaddz.x row_, vf00, col0 \n"
-            "vaddz.y row_, vf00, col1 \n"
-            "vaddz.z row_, vf00, col2 \n"
-            : "=&j row_"(row)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "vsub       $vf4, $vf0, $vf0         \n"
+            "vaddz.x $vf4, $vf0, $vf1        \n"
+            "vaddz.y $vf4, $vf0, $vf2        \n"
+            "vaddz.z $vf4, $vf0, $vf3        \n"
+            "qmfc2      %[row], $vf4             \n"
+            : [row] "=&r"(row)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val)
+            : "memory");
         return vec_3(row);
     }
 
     vec_3 get_row3() const
     {
         vec128_t row;
-        asm(
-            " ### mat_x3::get_row3 ### \n"
-            "vaddw.x row_, vf00, col0 \n"
-            "vaddw.y row_, vf00, col1 \n"
-            "vaddw.z row_, vf00, col2 \n"
-            : "=&j row_"(row)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "vsub       $vf4, $vf0, $vf0         \n"
+            "vaddw.x $vf4, $vf0, $vf1        \n"
+            "vaddw.y $vf4, $vf0, $vf2        \n"
+            "vaddw.z $vf4, $vf0, $vf3        \n"
+            "qmfc2      %[row], $vf4             \n"
+            : [row] "=&r"(row)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val)
+            : "memory");
         return vec_3(row);
     }
 
@@ -168,14 +218,23 @@ class mat_x3_template {
     operator*(vec_3 vec) const
     {
         vec128_t result;
-        asm(
-            " ### mat_x3 * vec_3 ### \n"
-            "vmulax    ACC, col0, vec      \n"
-            "vmadday    ACC, col1, vec      \n"
-            "vmaddz    result, col2, vec      \n"
-            : "=&j result"(result), "=r"(vu0_ACC)
-            : "j vec"(vec.vec128),
-            "j col0"(col0.vec128), "j col1"(col1.vec128), "j col2"(col2.vec128));
+        vec128_t vec_val = vec.vec128;
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[vec_val], $vf4         \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulax     $ACC, $vf1, $vf4         \n"
+            "vmadday    $ACC, $vf2, $vf4         \n"
+            "vmaddz     $vf5, $vf3, $vf4         \n"
+            "qmfc2      %[result], $vf5          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [vec_val] "r"(vec_val), [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val)
+            : "memory");
         return column_type(result);
     }
 
@@ -184,17 +243,28 @@ class mat_x3_template {
     mult_trans_col0(const mat_x3_template<mat_column_type>& mat) const
     {
         vec128_t result;
-        asm("### mat_x3 mult_trans mat_x3, column 0 of result ### \n"
-            "vmulax ACC, col0, mat0 \n"
-            "vmaddax ACC, col1, mat1 \n"
-            "vmaddx res0, col2, mat2 \n"
-            : "=&j res0"(result), "=r"(vu0_ACC)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j mat0"(mat.col0.vec128),
-            "j mat1"(mat.col1.vec128),
-            "j mat2"(mat.col2.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t mat0_val = mat.col0.vec128;
+        vec128_t mat1_val = mat.col1.vec128;
+        vec128_t mat2_val = mat.col2.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[mat0_val], $vf4        \n"
+            "qmtc2      %[mat1_val], $vf5        \n"
+            "qmtc2      %[mat2_val], $vf6        \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulax     $ACC, $vf1, $vf4         \n"
+            "vmaddax    $ACC, $vf2, $vf5         \n"
+            "vmaddx     $vf7, $vf3, $vf6         \n"
+            "qmfc2      %[result], $vf7          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val),
+              [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val)
+            : "memory");
         return column_type(result);
     }
 
@@ -203,17 +273,28 @@ class mat_x3_template {
     mult_trans_col1(const mat_x3_template<mat_column_type>& mat) const
     {
         vec128_t result;
-        asm("### mat_x3 mult_trans mat_x3, column 1 of result ### \n"
-            "vmulay ACC, col0, mat0 \n"
-            "vmadday ACC, col1, mat1 \n"
-            "vmaddy res1, col2, mat2 \n"
-            : "=&j res1"(result), "=r"(vu0_ACC)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j mat0"(mat.col0.vec128),
-            "j mat1"(mat.col1.vec128),
-            "j mat2"(mat.col2.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t mat0_val = mat.col0.vec128;
+        vec128_t mat1_val = mat.col1.vec128;
+        vec128_t mat2_val = mat.col2.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[mat0_val], $vf4        \n"
+            "qmtc2      %[mat1_val], $vf5        \n"
+            "qmtc2      %[mat2_val], $vf6        \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulay     $ACC, $vf1, $vf4         \n"
+            "vmadday    $ACC, $vf2, $vf5         \n"
+            "vmaddy     $vf7, $vf3, $vf6         \n"
+            "qmfc2      %[result], $vf7          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val),
+              [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val)
+            : "memory");
         return column_type(result);
     }
 
@@ -222,17 +303,28 @@ class mat_x3_template {
     mult_trans_col2(const mat_x3_template<mat_column_type>& mat) const
     {
         vec128_t result;
-        asm("### mat_x3 mult_trans mat_x3, column 2 of result ### \n"
-            "vmulaz ACC, col0, mat0 \n"
-            "vmaddaz ACC, col1, mat1 \n"
-            "vmaddz res2, col2, mat2 \n"
-            : "=&j res2"(result), "=r"(vu0_ACC)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j mat0"(mat.col0.vec128),
-            "j mat1"(mat.col1.vec128),
-            "j mat2"(mat.col2.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t mat0_val = mat.col0.vec128;
+        vec128_t mat1_val = mat.col1.vec128;
+        vec128_t mat2_val = mat.col2.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[mat0_val], $vf4        \n"
+            "qmtc2      %[mat1_val], $vf5        \n"
+            "qmtc2      %[mat2_val], $vf6        \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulaz     $ACC, $vf1, $vf4         \n"
+            "vmaddaz    $ACC, $vf2, $vf5         \n"
+            "vmaddz     $vf7, $vf3, $vf6         \n"
+            "qmfc2      %[result], $vf7          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val),
+              [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val)
+            : "memory");
         return column_type(result);
     }
 
@@ -241,17 +333,28 @@ class mat_x3_template {
     mult_trans_col3(const mat_x3_template<mat_column_type>& mat) const
     {
         vec128_t result;
-        asm("### mat_x3 mult_trans mat_x3, column 3 of result ### \n"
-            "vmulaw ACC, col0, mat0 \n"
-            "vmaddaw ACC, col1, mat1 \n"
-            "vmaddw res3, col2, mat2 \n"
-            : "=&j res3"(result), "=r"(vu0_ACC)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j mat0"(mat.col0.vec128),
-            "j mat1"(mat.col1.vec128),
-            "j mat2"(mat.col2.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t mat0_val = mat.col0.vec128;
+        vec128_t mat1_val = mat.col1.vec128;
+        vec128_t mat2_val = mat.col2.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[mat0_val], $vf4        \n"
+            "qmtc2      %[mat1_val], $vf5        \n"
+            "qmtc2      %[mat2_val], $vf6        \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulaw     $ACC, $vf1, $vf4         \n"
+            "vmaddaw    $ACC, $vf2, $vf5         \n"
+            "vmaddw     $vf7, $vf3, $vf6         \n"
+            "qmfc2      %[result], $vf7          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val),
+              [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val)
+            : "memory");
         return column_type(result);
     }
 };
@@ -284,16 +387,18 @@ class mat_x4_template {
 
     void set_zero()
     {
-        asm(
-            " ### mat_x4::set_zero ### \n"
-            "vsub    col0, col0, col0 \n"
-            "vsub    col1, col1, col1 \n"
-            "vsub    col2, col2, col2 \n"
-            "vsub    col3, col3, col3 \n"
-            : "=j col0"(col0.vec128),
-            "=j col1"(col1.vec128),
-            "=j col2"(col2.vec128),
-            "=j col3"(col3.vec128));
+        asm __volatile__(
+            "vsub       $vf1, $vf0, $vf0         \n"
+            "vsub       $vf2, $vf0, $vf0         \n"
+            "vsub       $vf3, $vf0, $vf0         \n"
+            "vsub       $vf4, $vf0, $vf0         \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            "qmfc2      %[col3], $vf4            \n"
+            : [col0] "=r"(col0.vec128), [col1] "=r"(col1.vec128), [col2] "=r"(col2.vec128), [col3] "=r"(col3.vec128)
+            :
+            : "memory");
     }
 
     void set_col0(column_type new_col) { col0 = new_col; }
@@ -303,62 +408,90 @@ class mat_x4_template {
 
     void set_row0(vec_4 new_row)
     {
-        asm(
-            " ### mat_x4::set_row0 ### \n"
-            "vaddx.x       col0, vf00, new_row \n"
-            "vaddy.x       col1, vf00, new_row \n"
-            "vaddz.x       col2, vf00, new_row \n"
-            "vaddw.x       col3, vf00, new_row \n"
-            : "+j col0"(col0.vec128),
-            "+j col1"(col1.vec128),
-            "+j col2"(col2.vec128),
-            "+j col3"(col3.vec128)
-            : "j new_row"(new_row.vec128));
+        vec128_t new_row_val = new_row.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0], $vf1            \n"
+            "qmtc2      %[col1], $vf2            \n"
+            "qmtc2      %[col2], $vf3            \n"
+            "qmtc2      %[col3], $vf4            \n"
+            "qmtc2      %[new_row_val], $vf5     \n"
+            "vaddx.x    $vf1, $vf0, $vf5         \n"
+            "vaddy.x    $vf2, $vf0, $vf5         \n"
+            "vaddz.x    $vf3, $vf0, $vf5         \n"
+            "vaddw.x    $vf4, $vf0, $vf5         \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            "qmfc2      %[col3], $vf4            \n"
+            : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128), [col3] "+r"(col3.vec128)
+            : [new_row_val] "r"(new_row_val)
+            : "memory");
     }
 
     void set_row1(vec_4 new_row)
     {
-        asm(
-            " ### mat_x4::set_row1 ### \n"
-            "vaddx.y       col0, vf00, new_row \n"
-            "vaddy.y       col1, vf00, new_row \n"
-            "vaddz.y       col2, vf00, new_row \n"
-            "vaddw.y       col3, vf00, new_row \n"
-            : "+j col0"(col0.vec128),
-            "+j col1"(col1.vec128),
-            "+j col2"(col2.vec128),
-            "+j col3"(col3.vec128)
-            : "j new_row"(new_row.vec128));
+        vec128_t new_row_val = new_row.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0], $vf1            \n"
+            "qmtc2      %[col1], $vf2            \n"
+            "qmtc2      %[col2], $vf3            \n"
+            "qmtc2      %[col3], $vf4            \n"
+            "qmtc2      %[new_row_val], $vf5     \n"
+            "vaddx.y    $vf1, $vf0, $vf5         \n"
+            "vaddy.y    $vf2, $vf0, $vf5         \n"
+            "vaddz.y    $vf3, $vf0, $vf5         \n"
+            "vaddw.y    $vf4, $vf0, $vf5         \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            "qmfc2      %[col3], $vf4            \n"
+            : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128), [col3] "+r"(col3.vec128)
+            : [new_row_val] "r"(new_row_val)
+            : "memory");
     }
 
     void set_row2(vec_4 new_row)
     {
-        asm(
-            " ### mat_x4::set_row2 ### \n"
-            "vaddx.z        col0, vf00, new_row \n"
-            "vaddy.z        col1, vf00, new_row \n"
-            "vaddz.z        col2, vf00, new_row \n"
-            "vaddw.z        col3, vf00, new_row \n"
-            : "+j col0"(col0.vec128),
-            "+j col1"(col1.vec128),
-            "+j col2"(col2.vec128),
-            "+j col3"(col3.vec128)
-            : "j new_row"(new_row.vec128));
+        vec128_t new_row_val = new_row.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0], $vf1            \n"
+            "qmtc2      %[col1], $vf2            \n"
+            "qmtc2      %[col2], $vf3            \n"
+            "qmtc2      %[col3], $vf4            \n"
+            "qmtc2      %[new_row_val], $vf5     \n"
+            "vaddx.z    $vf1, $vf0, $vf5         \n"
+            "vaddy.z    $vf2, $vf0, $vf5         \n"
+            "vaddz.z    $vf3, $vf0, $vf5         \n"
+            "vaddw.z    $vf4, $vf0, $vf5         \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            "qmfc2      %[col3], $vf4            \n"
+            : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128), [col3] "+r"(col3.vec128)
+            : [new_row_val] "r"(new_row_val)
+            : "memory");
     }
 
     void set_row3(vec_4 new_row)
     {
-        asm(
-            " ### mat_x4::set_row3 ### \n"
-            "vmulx.w        col0, vf00, new_row \n"
-            "vmuly.w        col1, vf00, new_row \n"
-            "vmulz.w        col2, vf00, new_row \n"
-            "vmulw.w        col3, vf00, new_row \n"
-            : "+j col0"(col0.vec128),
-            "+j col1"(col1.vec128),
-            "+j col2"(col2.vec128),
-            "+j col3"(col3.vec128)
-            : "j new_row"(new_row.vec128));
+        vec128_t new_row_val = new_row.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0], $vf1            \n"
+            "qmtc2      %[col1], $vf2            \n"
+            "qmtc2      %[col2], $vf3            \n"
+            "qmtc2      %[col3], $vf4            \n"
+            "qmtc2      %[new_row_val], $vf5     \n"
+            "vmulx.w    $vf1, $vf0, $vf5         \n"
+            "vmuly.w    $vf2, $vf0, $vf5         \n"
+            "vmulz.w    $vf3, $vf0, $vf5         \n"
+            "vmulw.w    $vf4, $vf0, $vf5         \n"
+            "qmfc2      %[col0], $vf1            \n"
+            "qmfc2      %[col1], $vf2            \n"
+            "qmfc2      %[col2], $vf3            \n"
+            "qmfc2      %[col3], $vf4            \n"
+            : [col0] "+r"(col0.vec128), [col1] "+r"(col1.vec128), [col2] "+r"(col2.vec128), [col3] "+r"(col3.vec128)
+            : [new_row_val] "r"(new_row_val)
+            : "memory");
     }
 
     column_type get_col0() const { return col0; }
@@ -369,68 +502,96 @@ class mat_x4_template {
     vec_4 get_row0() const
     {
         vec128_t row;
-        asm(
-            " ### mat_x4::get_row0 ### \n"
-            "vaddx.x row_, vf00, col0 \n"
-            "vaddx.y row_, vf00, col1 \n"
-            "vaddx.z row_, vf00, col2 \n"
-            "vmulx.w row_, vf00, col3 \n"
-            : "=&j row_"(row)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j col3"(col3.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "vsub       $vf5, $vf0, $vf0         \n"
+            "vaddx.x $vf5, $vf0, $vf1        \n"
+            "vaddx.y $vf5, $vf0, $vf2        \n"
+            "vaddx.z $vf5, $vf0, $vf3        \n"
+            "vmulx.w $vf5, $vf0, $vf4        \n"
+            "qmfc2      %[row], $vf5             \n"
+            : [row] "=&r"(row)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val)
+            : "memory");
         return vec_4(row);
     }
 
     vec_4 get_row1() const
     {
         vec128_t row;
-        asm(
-            " ### mat_x4::get_row1 ### \n"
-            "vaddy.x row_, vf00, col0 \n"
-            "vaddy.y row_, vf00, col1 \n"
-            "vaddy.z row_, vf00, col2 \n"
-            "vmuly.w row_, vf00, col3 \n"
-            : "=&j row_"(row)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j col3"(col3.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "vsub       $vf5, $vf0, $vf0         \n"
+            "vaddy.x $vf5, $vf0, $vf1        \n"
+            "vaddy.y $vf5, $vf0, $vf2        \n"
+            "vaddy.z $vf5, $vf0, $vf3        \n"
+            "vmuly.w $vf5, $vf0, $vf4        \n"
+            "qmfc2      %[row], $vf5             \n"
+            : [row] "=&r"(row)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val)
+            : "memory");
         return vec_4(row);
     }
 
     vec_4 get_row2() const
     {
         vec128_t row;
-        asm(
-            " ### mat_x4::get_row2 ### \n"
-            "vaddz.x row_, vf00, col0 \n"
-            "vaddz.y row_, vf00, col1 \n"
-            "vaddz.z row_, vf00, col2 \n"
-            "vmulz.w row_, vf00, col3 \n"
-            : "=&j row_"(row)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j col3"(col3.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "vsub       $vf5, $vf0, $vf0         \n"
+            "vaddz.x $vf5, $vf0, $vf1        \n"
+            "vaddz.y $vf5, $vf0, $vf2        \n"
+            "vaddz.z $vf5, $vf0, $vf3        \n"
+            "vmulz.w $vf5, $vf0, $vf4        \n"
+            "qmfc2      %[row], $vf5             \n"
+            : [row] "=&r"(row)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val)
+            : "memory");
         return vec_4(row);
     }
 
     vec_4 get_row3() const
     {
         vec128_t row;
-        asm(
-            " ### mat_x4::get_row3 ### \n"
-            "vaddw.x row_, vf00, col0 \n"
-            "vaddw.y row_, vf00, col1 \n"
-            "vaddw.z row_, vf00, col2 \n"
-            "vmulw.w row_, vf00, col3 \n"
-            : "=&j row_"(row)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j col3"(col3.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "vsub       $vf5, $vf0, $vf0         \n"
+            "vaddw.x $vf5, $vf0, $vf1        \n"
+            "vaddw.y $vf5, $vf0, $vf2        \n"
+            "vaddw.z $vf5, $vf0, $vf3        \n"
+            "vmulw.w    $vf5, $vf0, $vf4        \n"
+            "qmfc2      %[row], $vf5             \n"
+            : [row] "=&r"(row)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val)
+            : "memory");
         return vec_4(row);
     }
 
@@ -438,16 +599,26 @@ class mat_x4_template {
     operator*(vec_4 vec) const
     {
         vec128_t result;
-        asm(
-            " ### mat_x4 * vec_4 ### \n"
-            "vmulax    ACC, col0, vec      \n"
-            "vmadday    ACC, col1, vec      \n"
-            "vmaddaz    ACC, col2, vec      \n"
-            "vmaddw    result, col3, vec      \n"
-            : "=&j result"(result), "=r"(vu0_ACC)
-            : "j vec"(vec.vec128),
-            "j col0"(col0.vec128), "j col1"(col1.vec128),
-            "j col2"(col2.vec128), "j col3"(col3.vec128));
+        vec128_t vec_val = vec.vec128;
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "qmtc2      %[vec_val], $vf5         \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulax     $ACC, $vf1, $vf5         \n"
+            "vmadday    $ACC, $vf2, $vf5         \n"
+            "vmaddaz    $ACC, $vf3, $vf5         \n"
+            "vmaddw     $vf6, $vf4, $vf5         \n"
+            "qmfc2      %[result], $vf6          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [vec_val] "r"(vec_val), [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val)
+            : "memory");
         return column_type(result);
     }
 
@@ -455,14 +626,23 @@ class mat_x4_template {
     operator*(vector_t vec) const
     {
         vec128_t result;
-        asm(
-            " ### mat_x4 * vector_t ### \n"
-            "vmulax    ACC, col0, vec      \n"
-            "vmadday    ACC, col1, vec      \n"
-            "vmaddz    result, col2, vec      \n"
-            : "=&j result"(result), "=r"(vu0_ACC)
-            : "j vec"(vec.vec128),
-            "j col0"(col0.vec128), "j col1"(col1.vec128), "j col2"(col2.vec128));
+        vec128_t vec_val = vec.vec128;
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[vec_val], $vf4         \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulax     $ACC, $vf1, $vf4         \n"
+            "vmadday    $ACC, $vf2, $vf4         \n"
+            "vmaddz     $vf5, $vf3, $vf4         \n"
+            "qmfc2      %[result], $vf5          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [vec_val] "r"(vec_val), [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val)
+            : "memory");
         return column_type(result);
     }
 
@@ -470,16 +650,26 @@ class mat_x4_template {
     operator*(point_t pt) const
     {
         vec128_t result;
-        asm(
-            " ### mat_x4 * point_t ### \n"
-            "vmulax    ACC, col0, pt      \n"
-            "vmadday    ACC, col1, pt      \n"
-            "vmaddaz    ACC, col2, pt      \n"
-            "vmaddw    result, col3, vf00    \n"
-            : "=&j result"(result), "=r"(vu0_ACC)
-            : "j pt"(pt.vec128),
-            "j col0"(col0.vec128), "j col1"(col1.vec128),
-            "j col2"(col2.vec128), "j col3"(col3.vec128));
+        vec128_t pt_val = pt.vec128;
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "qmtc2      %[pt_val], $vf5          \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulax     $ACC, $vf1, $vf5         \n"
+            "vmadday    $ACC, $vf2, $vf5         \n"
+            "vmaddaz    $ACC, $vf3, $vf5         \n"
+            "vmaddw     $vf6, $vf4, $vf0         \n"
+            "qmfc2      %[result], $vf6          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [pt_val] "r"(pt_val), [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val)
+            : "memory");
         return column_type(result);
     }
 
@@ -488,20 +678,33 @@ class mat_x4_template {
     mult_trans_col0(const mat_x4_template<mat_column_type>& mat) const
     {
         vec128_t result;
-        asm("### mat_x4 mult_trans mat_x4, column 0 of result ### \n"
-            "vmulax ACC, col0, mat0 \n"
-            "vmaddax ACC, col1, mat1 \n"
-            "vmaddax ACC, col2, mat2 \n"
-            "vmaddx res0, col3, mat3 \n"
-            : "=&j res0"(result), "=r"(vu0_ACC)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j col3"(col3.vec128),
-            "j mat0"(mat.col0.vec128),
-            "j mat1"(mat.col1.vec128),
-            "j mat2"(mat.col2.vec128),
-            "j mat3"(mat.col3.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        vec128_t mat0_val = mat.col0.vec128;
+        vec128_t mat1_val = mat.col1.vec128;
+        vec128_t mat2_val = mat.col2.vec128;
+        vec128_t mat3_val = mat.col3.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "qmtc2      %[mat0_val], $vf5        \n"
+            "qmtc2      %[mat1_val], $vf6        \n"
+            "qmtc2      %[mat2_val], $vf7        \n"
+            "qmtc2      %[mat3_val], $vf8        \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulax     $ACC, $vf1, $vf5         \n"
+            "vmaddax    $ACC, $vf2, $vf6         \n"
+            "vmaddax    $ACC, $vf3, $vf7         \n"
+            "vmaddx     $vf9, $vf4, $vf8         \n"
+            "qmfc2      %[result], $vf9          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val),
+              [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val), [mat3_val] "r"(mat3_val)
+            : "memory");
         return column_type(result);
     }
 
@@ -510,20 +713,33 @@ class mat_x4_template {
     mult_trans_col1(const mat_x4_template<mat_column_type>& mat) const
     {
         vec128_t result;
-        asm("### mat_x4 mult_trans mat_x4, column 1 of result ### \n"
-            "vmulay ACC, col0, mat0 \n"
-            "vmadday ACC, col1, mat1 \n"
-            "vmadday ACC, col2, mat2 \n"
-            "vmaddy res1, col3, mat3 \n"
-            : "=&j res1"(result), "=r"(vu0_ACC)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j col3"(col3.vec128),
-            "j mat0"(mat.col0.vec128),
-            "j mat1"(mat.col1.vec128),
-            "j mat2"(mat.col2.vec128),
-            "j mat3"(mat.col3.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        vec128_t mat0_val = mat.col0.vec128;
+        vec128_t mat1_val = mat.col1.vec128;
+        vec128_t mat2_val = mat.col2.vec128;
+        vec128_t mat3_val = mat.col3.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "qmtc2      %[mat0_val], $vf5        \n"
+            "qmtc2      %[mat1_val], $vf6        \n"
+            "qmtc2      %[mat2_val], $vf7        \n"
+            "qmtc2      %[mat3_val], $vf8        \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulay     $ACC, $vf1, $vf5         \n"
+            "vmadday    $ACC, $vf2, $vf6         \n"
+            "vmadday    $ACC, $vf3, $vf7         \n"
+            "vmaddy     $vf9, $vf4, $vf8         \n"
+            "qmfc2      %[result], $vf9          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val),
+              [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val), [mat3_val] "r"(mat3_val)
+            : "memory");
         return column_type(result);
     }
 
@@ -532,20 +748,33 @@ class mat_x4_template {
     mult_trans_col2(const mat_x4_template<mat_column_type>& mat) const
     {
         vec128_t result;
-        asm("### mat_x4 mult_trans mat_x4, column 2 of result ### \n"
-            "vmulaz ACC, col0, mat0 \n"
-            "vmaddaz ACC, col1, mat1 \n"
-            "vmaddaz ACC, col2, mat2 \n"
-            "vmaddz res2, col3, mat3 \n"
-            : "=&j res2"(result), "=r"(vu0_ACC)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j col3"(col3.vec128),
-            "j mat0"(mat.col0.vec128),
-            "j mat1"(mat.col1.vec128),
-            "j mat2"(mat.col2.vec128),
-            "j mat3"(mat.col3.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        vec128_t mat0_val = mat.col0.vec128;
+        vec128_t mat1_val = mat.col1.vec128;
+        vec128_t mat2_val = mat.col2.vec128;
+        vec128_t mat3_val = mat.col3.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "qmtc2      %[mat0_val], $vf5        \n"
+            "qmtc2      %[mat1_val], $vf6        \n"
+            "qmtc2      %[mat2_val], $vf7        \n"
+            "qmtc2      %[mat3_val], $vf8        \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulaz     $ACC, $vf1, $vf5         \n"
+            "vmaddaz    $ACC, $vf2, $vf6         \n"
+            "vmaddaz    $ACC, $vf3, $vf7         \n"
+            "vmaddz     $vf9, $vf4, $vf8         \n"
+            "qmfc2      %[result], $vf9          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val),
+              [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val), [mat3_val] "r"(mat3_val)
+            : "memory");
         return column_type(result);
     }
 
@@ -554,20 +783,33 @@ class mat_x4_template {
     mult_trans_col3(const mat_x4_template<mat_column_type>& mat) const
     {
         vec128_t result;
-        asm("### mat_x4 mult_trans mat_x4, column 3 of result ### \n"
-            "vmulaw ACC, col0, mat0 \n"
-            "vmaddaw ACC, col1, mat1 \n"
-            "vmaddaw ACC, col2, mat2 \n"
-            "vmaddw res3, col3, mat3 \n"
-            : "=&j res3"(result), "=r"(vu0_ACC)
-            : "j col0"(col0.vec128),
-            "j col1"(col1.vec128),
-            "j col2"(col2.vec128),
-            "j col3"(col3.vec128),
-            "j mat0"(mat.col0.vec128),
-            "j mat1"(mat.col1.vec128),
-            "j mat2"(mat.col2.vec128),
-            "j mat3"(mat.col3.vec128));
+        vec128_t col0_val = col0.vec128;
+        vec128_t col1_val = col1.vec128;
+        vec128_t col2_val = col2.vec128;
+        vec128_t col3_val = col3.vec128;
+        vec128_t mat0_val = mat.col0.vec128;
+        vec128_t mat1_val = mat.col1.vec128;
+        vec128_t mat2_val = mat.col2.vec128;
+        vec128_t mat3_val = mat.col3.vec128;
+        asm __volatile__(
+            "qmtc2      %[col0_val], $vf1        \n"
+            "qmtc2      %[col1_val], $vf2        \n"
+            "qmtc2      %[col2_val], $vf3        \n"
+            "qmtc2      %[col3_val], $vf4        \n"
+            "qmtc2      %[mat0_val], $vf5        \n"
+            "qmtc2      %[mat1_val], $vf6        \n"
+            "qmtc2      %[mat2_val], $vf7        \n"
+            "qmtc2      %[mat3_val], $vf8        \n"
+            "vsuba      $ACC, $vf0, $vf0         \n"
+            "vmulaw     $ACC, $vf1, $vf5         \n"
+            "vmaddaw    $ACC, $vf2, $vf6         \n"
+            "vmaddaw    $ACC, $vf3, $vf7         \n"
+            "vmaddw     $vf9, $vf4, $vf8         \n"
+            "qmfc2      %[result], $vf9          \n"
+            : [result] "=&r"(result), [acc] "=r"(vu0_ACC)
+            : [col0_val] "r"(col0_val), [col1_val] "r"(col1_val), [col2_val] "r"(col2_val), [col3_val] "r"(col3_val),
+              [mat0_val] "r"(mat0_val), [mat1_val] "r"(mat1_val), [mat2_val] "r"(mat2_val), [mat3_val] "r"(mat3_val)
+            : "memory");
         return column_type(result);
     }
 };
diff --git a/include/ps2s/vector.h b/include/ps2s/vector.h
index f4d5b83..1a46b51 100644
--- a/include/ps2s/vector.h
+++ b/include/ps2s/vector.h
@@ -211,25 +211,11 @@ class vec_x : public vec_template<x_base> {
 
     inline vec_x(float x)
     {
-        asm(
-            " ### init vec_x with a float ###       \n"
-            "  .if %A0                              \n"
-            "  move %[_this], %[new_x]              \n"
-            "  .endif                               \n"
-            "                                       \n"
-            "  .if %A1                              \n"
-            "  lw %[_this], %[new_x]                \n"
-            "  .endif                               \n"
-            "                                       \n"
-            "  .if %A2                              \n"
-            "  mfc1 %[_this], %[new_x]              \n"
-            "  .endif                               \n"
-            "                                       \n"
-            "  .if %A3                              \n"
-            "  qmtc2 %[new_x], %[_this]             \n"
-            "  .endif                               \n"
-            : [_this] "=r,r,r,j"(vec128)
-            : [new_x] "r,m,f,r"(x));
+        vec128_t temp;
+        asm __volatile__(
+            "mfc1       %[vec128], %[x]              \n"
+            : [vec128] "=r"(vec128)
+            : [x] "f"(x));
     }
 
     explicit inline vec_x(const vec_xyz rhs);
@@ -240,25 +226,14 @@ class vec_x : public vec_template<x_base> {
     inline operator float() const
     {
         float new_float;
-        asm(
-            " ### vec_x to float ###                \n"
-            "  .if %A0                              \n"
-            "  move %[new_float], %[_this]          \n"
-            "  .endif                               \n"
-            "                                       \n"
-            "  .if %A1                              \n"
-            "  sw %[_this], %[new_float]            \n"
-            "  .endif                               \n"
-            "                                       \n"
-            "  .if %A2                              \n"
-            "  mtc1 %[_this], %[new_float]          \n"
-            "  .endif                               \n"
-            "                                       \n"
-            "  .if %A3                              \n"
-            "  qmfc2 %[new_float], %[_this]         \n"
-            "  .endif                               \n"
-            : [new_float] "=r,m,f,r"(new_float)
-            : [_this] "r,r,r,j"(*this));
+        vec128_t this_val = this->vec128;
+        uint32_t x_bits;
+        asm __volatile__(
+            "move       %[x_bits], %[this_val]       \n"
+            : [x_bits] "=r"(x_bits)
+            : [this_val] "r"(this_val)
+            : "memory");
+        new_float = *reinterpret_cast<float*>(&x_bits);
         return new_float;
     }
 
@@ -367,13 +342,16 @@ class vec_y : public vec_template<y_base> {
 
     inline vec_y(float y)
     {
-        asm(
-            " ### init vec_y with a float ### \n"
-            "ctc2	%[f_y], $vi21 \n"
-            "vnop \n"
-            "vaddi.y	%[_this], vf00, I \n"
-            : [_this] "=j"(vec128)
-            : [f_y] "r"(y));
+        vec128_t result;
+        asm __volatile__(
+            "ctc2       %[f_y], $vi21            \n"
+            "vnop                                 \n"
+            "vaddi.y    $vf1, $vf0, $I           \n"
+            "qmfc2      %[result], $vf1          \n"
+            : [result] "=r"(result)
+            : [f_y] "r"(y)
+            : "memory");
+        vec128 = result;
     }
 
     explicit inline vec_y(const vec_xyz rhs);
@@ -490,13 +468,16 @@ class vec_z : public vec_template<z_base> {
 
     inline vec_z(float z)
     {
-        asm(
-            " ### init vec_z with a float ### \n"
-            "ctc2	%[f_z], $vi21 \n"
-            "vnop \n"
-            "vaddi.z	%[_this], vf00, I \n"
-            : [_this] "=j"(vec128)
-            : [f_z] "r"(z));
+        vec128_t result;
+        asm __volatile__(
+            "ctc2       %[f_z], $vi21            \n"
+            "vnop                                 \n"
+            "vaddi.z    $vf1, $vf0, $I           \n"
+            "qmfc2      %[result], $vf1          \n"
+            : [result] "=r"(result)
+            : [f_z] "r"(z)
+            : "memory");
+        vec128 = result;
     }
 
     explicit inline vec_z(const vec_xyz rhs);
@@ -613,13 +594,16 @@ class vec_w : public vec_template<w_base> {
 
     inline vec_w(float w)
     {
-        asm(
-            " ### init vec_w with a float ### \n"
-            "ctc2		%[f_w], $vi21 \n"
-            "vnop \n"
-            "vmuli.w	%[_this], vf00, I \n"
-            : [_this] "=j"(vec128)
-            : [f_w] "r"(w));
+        vec128_t result;
+        asm __volatile__(
+            "ctc2       %[f_w], $vi21            \n"
+            "vnop                                 \n"
+            "vmuli.w    $vf1, $vf0, $I           \n"
+            "qmfc2      %[result], $vf1          \n"
+            : [result] "=r"(result)
+            : [f_w] "r"(w)
+            : "memory");
+        vec128 = result;
     }
 
     explicit inline vec_w(const vec_xyzw rhs);
@@ -1082,15 +1066,19 @@ class vec_xy : public vec_template<xy_base> {
     inline vec_xy one_over() const
     {
         vec128_t result;
-        asm(" ### reciprocal of vec_xy ### \n"
-            "vdiv		Q, vf00w, %[_this]x \n"
-            "vwaitq \n"
-            "vaddq.x	%[result], vf00, Q \n"
-            "vdiv		Q, vf00w, %[_this]y \n"
-            "vwaitq \n"
-            "vaddq.y	%[result], vf00, Q \n"
-            : [result] "=&j"(result)
-            : [_this] "j"(*this));
+        vec128_t this_val = this->vec128;
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            "vdiv       $Q, $vf0w, $vf1x        \n"
+            "vwaitq                              \n"
+            "vaddq.x    $vf2, $vf0, $Q          \n"
+            "vdiv       $Q, $vf0w, $vf1y        \n"
+            "vwaitq                              \n"
+            "vaddq.y    $vf2, $vf0, $Q          \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val)
+            : "memory");
         return vec_xy(result);
     }
 
@@ -1330,18 +1318,22 @@ class vec_xyz : public vec_template<xyz_base> {
     inline vec_xyz one_over() const
     {
         vec128_t result;
-        asm(" ### reciprocal of vec_xyz ### \n"
-            "vdiv		Q, vf00w, %[_this]x \n"
-            "vwaitq \n"
-            "vaddq.x	%[result], vf00, Q \n"
-            "vdiv		Q, vf00w, %[_this]y \n"
-            "vwaitq \n"
-            "vaddq.y	%[result], vf00, Q \n"
-            "vdiv		Q, vf00w, %[_this]z \n"
-            "vwaitq \n"
-            "vaddq.z	%[result], vf00, Q \n"
-            : [result] "=&j"(result)
-            : [_this] "j"(*this));
+        vec128_t this_val = this->vec128;
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            "vdiv       $Q, $vf0w, $vf1x        \n"
+            "vwaitq                              \n"
+            "vaddq.x    $vf2, $vf0, $Q          \n"
+            "vdiv       $Q, $vf0w, $vf1y        \n"
+            "vwaitq                              \n"
+            "vaddq.y    $vf2, $vf0, $Q          \n"
+            "vdiv       $Q, $vf0w, $vf1z        \n"
+            "vwaitq                              \n"
+            "vaddq.z    $vf2, $vf0, $Q          \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val)
+            : "memory");
         return vec_xyz(result);
     }
 
@@ -1376,14 +1368,20 @@ class vec_xyz : public vec_template<xyz_base> {
     dot(const vec_xyz rhs) const
     {
         vec128_t result, one;
-        asm(
-            " ### vec_xyz dot vec_xyz ### \n"
-            "vmul		%[result], %[lhs], %[rhs] \n"
-            "vaddw.x	%[one], vf00, vf00 \n"
-            "vadday.x	ACC, %[result], %[result] \n"
-            "vmaddz.x	%[result], %[one], %[result] \n"
-            : [result] "=j"(result), [one] "=j"(one)
-            : [lhs] "j"(*this), [rhs] "j"(rhs));
+        vec128_t lhs_val = this->vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1            \n"
+            "qmtc2      %[rhs_val], $vf2            \n"
+            "vmul       $vf3, $vf1, $vf2            \n"
+            "vaddw.x    $vf4, $vf0, $vf0            \n"
+            "vadday.x   $ACC, $vf3, $vf3            \n"
+            "vmaddz.x   $vf5, $vf4, $vf3            \n"
+            "qmfc2      %[result], $vf5             \n"
+            "qmfc2      %[one], $vf4                \n"
+            : [result] "=r"(result), [one] "=r"(one), [acc] "=r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val)
+            : "memory");
         return vec_x(result);
     }
 
@@ -1394,24 +1392,28 @@ class vec_xyz : public vec_template<xyz_base> {
     {
         vec128_t result, dot, one;
         int cond;
-        asm(
-            "### vec_xyz normalized ###\n"
-            "vmul	%[dot], %[_this], %[_this] \n"
-            "vaddw.x	%[one], vf00, vf00 \n"
-            "vadday.x	ACC, %[dot], %[dot] \n"
-            "vmaddz.x	%[dot], %[one], %[dot] \n"
-            "vrsqrt	Q, vf00w, %[dot]x\n"
-            "cfc2	%[cond], $vi17\n"
-            "vsub	%[result], %[result], %[result]\n"
-            "andi	%[cond], %[cond], 8\n"
-            "bgtz	%[cond], 0f \n"
-            "nop\n"
-            "vwaitq\n"
-            "vmulq	%[result], %[_this], Q\n"
-            "0:\n"
-            : [result] "=&j"(result), [one] "=&j"(one), [dot] "=&j"(dot), [cond] "=r"(cond)
-            : [_this] "j"(*this));
-
+        vec128_t this_val = this->vec128;
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1            \n"
+            "vmul       $vf2, $vf1, $vf1             \n"
+            "vaddw.x    $vf3, $vf0, $vf0             \n"
+            "vadday.x   $ACC, $vf2, $vf2             \n"
+            "vmaddz.x   $vf2, $vf3, $vf2             \n"
+            "vrsqrt $Q, $vf0w, $vf2x             \n"
+            "cfc2       %[cond], $vi17               \n"
+            "vsub       $vf4, $vf0, $vf0             \n"
+            "andi       %[cond], %[cond], 8          \n"
+            "bgtz       %[cond], 0f                   \n"
+            "nop                                     \n"
+            "vwaitq                                  \n"
+            "vmulq      $vf4, $vf1, $Q               \n"
+            "0:                                      \n"
+            "qmfc2      %[result], $vf4              \n"
+            "qmfc2      %[dot], $vf2                 \n"
+            "qmfc2      %[one], $vf3                 \n"
+            : [result] "=&r"(result), [one] "=&r"(one), [dot] "=&r"(dot), [cond] "=r"(cond)
+            : [this_val] "r"(this_val)
+            : "memory");
         return vec_xyz(result);
     }
 
@@ -1419,12 +1421,17 @@ class vec_xyz : public vec_template<xyz_base> {
     cross(const vec_xyz rhs) const
     {
         vec128_t result;
-        asm(
-            " ### vec_xyz cross vec_xyz ### \n"
-            "vopmula.xyz	ACC, %[lhs], %[rhs] \n"
-            "vopmsub.xyz	%[result], %[rhs], %[lhs] \n"
-            : [result] "=j"(result)
-            : [lhs] "j"(*this), [rhs] "j"(rhs));
+        vec128_t lhs_val = this->vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1            \n"
+            "qmtc2      %[rhs_val], $vf2            \n"
+            "vopmula    $ACC, $vf1, $vf2            \n"
+            "vopmsub    $vf3, $vf2, $vf1            \n"
+            "qmfc2      %[result], $vf3             \n"
+            : [result] "=r"(result), [acc] "=r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val)
+            : "memory");
         return vec_xyz(result);
     }
 
@@ -1725,21 +1732,25 @@ class vec_xyzw : public vec_template<xyzw_base> {
     inline vec_xyzw one_over() const
     {
         vec128_t result;
-        asm(" ### reciprocal of vec_xyzw ### \n"
-            "vdiv		Q, vf00w, %[_this]x \n"
-            "vwaitq \n"
-            "vaddq.x	%[result], vf00, Q \n"
-            "vdiv		Q, vf00w, %[_this]y \n"
-            "vwaitq \n"
-            "vaddq.y	%[result], vf00, Q \n"
-            "vdiv		Q, vf00w, %[_this]z \n"
-            "vwaitq \n"
-            "vaddq.z	%[result], vf00, Q \n"
-            "vdiv		Q, vf00w, %[_this]w \n"
-            "vwaitq \n"
-            "vmulq.w	%[result], vf00, Q \n"
-            : [result] "=&j"(result)
-            : [_this] "j"(*this));
+        vec128_t this_val = this->vec128;
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            "vdiv       $Q, $vf0w, $vf1x        \n"
+            "vwaitq                              \n"
+            "vaddq.x    $vf2, $vf0, $Q          \n"
+            "vdiv       $Q, $vf0w, $vf1y        \n"
+            "vwaitq                              \n"
+            "vaddq.y    $vf2, $vf0, $Q          \n"
+            "vdiv       $Q, $vf0w, $vf1z        \n"
+            "vwaitq                              \n"
+            "vaddq.z    $vf2, $vf0, $Q          \n"
+            "vdiv       $Q, $vf0w, $vf1w        \n"
+            "vwaitq                              \n"
+            "vmulq.w    $vf2, $vf0, $Q          \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val)
+            : "memory");
         return vec_xyzw(result);
     }
 
@@ -1807,10 +1818,10 @@ class vec_xyzw : public vec_template<xyzw_base> {
         asm(
             "### vec_xyzw normalized ###\n"
             "vmul	%[dot], %[_this], %[_this] \n"
-            "vaddw.x	%[one, vf00, vf00 \n"
+            "vaddw.x	%[one], vf00, vf00 \n"
             "vadday.x	ACC, %[dot], %[dot] \n"
             "vmaddaz.x	ACC, %[one], %[dot] \n"
-            "vmaddw.x	%[dot, %[one, %[dot] \n"
+            "vmaddw.x	%[dot], %[one], %[dot] \n"
             "vrsqrt	Q, vf00w, %[dot]x\n"
             "cfc2	%[cond], $vi17\n"
             "vsub	%[result], %[result], %[result]\n"
@@ -2330,13 +2341,13 @@ class vector_t : public vec_template<vector_base> {
         vec128_t temp0;
         asm volatile(
             "mtsab		$0, 4				# get ready to shift right 4 bytes	\n"
-            "mtc1		%[_vec, %[_x]			# x = value.x	\n"
+            "mtc1		%[_vec], %[_x]			# x = value.x	\n"
             "qfsrv		%[_temp0], $0, %[_vec]		# temp0 = value >> 8	\n"
             "mtc1		%[_temp0], %[_y]		# y = value.y \n"
             "qfsrv		%[_temp0], $0, %[_temp0]	# temp0 >>= 8 \n"
             "mtc1		%[_temp0], %[_z]		# z = value.z \n"
             : [_x] "=f"(x), [_y] "=f"(y), [_z] "=f"(z), [_temp0] "=r"(temp0)
-            : [_vec] "r"(vec128));
+            : [_vec] "r"(this->vec128));
 
         printf("(%f %f %f (0.0))\n", x, y, z);
     }
diff --git a/include/ps2s/vector_common.h b/include/ps2s/vector_common.h
index febc4a8..2a1994a 100644
--- a/include/ps2s/vector_common.h
+++ b/include/ps2s/vector_common.h
@@ -7,6 +7,9 @@
 #ifndef vector_common_h
 #define vector_common_h
 
+#include <stdint.h>
+#include <stdio.h>
+
 /********************************************
  * This file contains the class 'vec_template' which provides all vector
  * methods that can be generalized, as well as the '*_base' classes that
@@ -65,8 +68,8 @@
  */
 
 register int vu0_ACC asm("$0");
-// let's use vf0 for this one to avoid a compiler warning
-register int vu0_Q asm("vf0") __attribute__((mode(TI)));
+// Q register is accessed via cfc2/ctc2 instructions, not as a global register
+// We'll handle it differently in inline asm
 
 /********************************************
  * constants
@@ -175,11 +178,11 @@ class vec_template : public base_vec {
     // constructors
 
     inline vec_template() {}
-    inline vec_template(const vec128_t initVal) { vec128 = initVal; }
+    inline vec_template(const vec128_t initVal) { this->vec128 = initVal; }
 
     // accessors
 
-    inline vec128_t get128() const { return vec128; }
+    inline vec128_t get128() const { return this->vec128; }
 
     // mutators
 
@@ -188,14 +191,17 @@ class vec_template : public base_vec {
     inline void
     set(const rhs_type rhs)
     {
-        asm(" ### set vec_l_fields = vec_r_fields ### \n"
-            "vmove.mask	this, rhs \n"
-            : "=j this"(*this)
-            : "j rhs"(rhs),
-            "0"(*this),
-            "vu_mask l_fields"(this->valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields),
-            "vu_mask mask"(this->valid_fields & rhs.valid_fields));
+        vec128_t rhs_val = rhs.vec128;
+        unsigned int mask = this->valid_fields & rhs.valid_fields;
+        asm __volatile__(
+            "qmtc2      %[rhs_val], $vf1        \n"
+            "qmtc2      %[this_val], $vf2       \n"
+            "vsub       $vf2, $vf0, $vf0        \n"
+            "vmove      $vf2, $vf1              \n"
+            "qmfc2      %[this_val], $vf2       \n"
+            : [this_val] "+r"(this->vec128)
+            : [rhs_val] "r"(rhs_val), [mask] "r"(mask)
+            : "memory");
     }
     // make sure that for vectors of the same type it's a simple assignment
     inline void
@@ -207,22 +213,23 @@ class vec_template : public base_vec {
     inline void
     set(float rhs)
     {
-        asm("### set all vec_fields fields to float ### \n"
-            "pextlw rhs, rhs, rhs \n"
-            "pextlw this, rhs, rhs \n"
-            : "=r this"(*this)
-            : "r rhs"(rhs));
+        asm __volatile__(
+            "pextlw     %[this_val], %[rhs], %[rhs] \n"
+            "pextlw     %[this_val], %[this_val], %[this_val] \n"
+            : [this_val] "+r"(this->vec128)
+            : [rhs] "r"(*reinterpret_cast<uint32_t*>(&rhs)));
     }
 
     inline void
     set_zero()
     {
-
-        asm(" ### vec_l_fields set_zero ### \n"
-            "vsub.mask this, vf00, vf00 \n"
-            : "=j this"(*this)
-            : "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields));
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            "vsub       $vf1, $vf0, $vf0         \n"
+            "qmfc2      %[this_val], $vf1        \n"
+            : [this_val] "+r"(this->vec128)
+            :
+            : "memory");
     }
 
     static const unsigned int fields = (base_vec::broadcast_field << 4) | base_vec::valid_fields;
@@ -241,30 +248,33 @@ class vec_template : public base_vec {
     {
         const lhs_type& lhs = *this;
         vec128_t result;
-        asm(
-            " ### vec_l_fields + vec_r_fields ### \n"
-            "vadd_bc._mask	result, lhs, rhs \n"
-            : "=j result"(result)
-            : "j lhs"(lhs), "j rhs"(rhs),
-            "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields),
-            "vu_mask l_fields"(lhs.valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields));
+        vec128_t lhs_val = lhs.vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1        \n"
+            "qmtc2      %[rhs_val], $vf2        \n"
+            "vadd       $vf3, $vf1, $vf2        \n"
+            "qmfc2      %[result], $vf3         \n"
+            : [result] "=r"(result)
+            : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val)
+            : "memory");
         return lhs_type(result);
     }
     inline lhs_type
     operator+(float rhs) const
     {
         vec128_t result;
-        asm(
-            " ### vec_l_fields + float ### \n"
-            "ctc2	rhs, $vi21 \n"
-            "vnop \n"
-            "vaddi.mask	result, lhs, I \n"
-            : "=j result"(result)
-            : "r rhs"(rhs),
-            "j lhs"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields));
+        vec128_t lhs_val = this->vec128;
+        uint32_t rhs_bits = *reinterpret_cast<uint32_t*>(&rhs);
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1        \n"
+            "ctc2       %[rhs_bits], $vi21      \n"
+            "vnop                               \n"
+            "vaddi      $vf2, $vf1, $I          \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result)
+            : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits)
+            : "memory");
         return lhs_type(result);
     }
     template <class rhs_type>
@@ -280,14 +290,16 @@ class vec_template : public base_vec {
     {
         const lhs_type& lhs = *this;
         vec128_t result;
-        asm(
-            " ### vec_l_fields - vec_r_fields ### \n"
-            "vsub_bc._mask %0, %1, %2 \n"
-            : "=j"(result)
-            : "j"(lhs), "j"(rhs),
-            "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields),
-            "vu_mask l_fields"(lhs.valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields));
+        vec128_t lhs_val = lhs.vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1        \n"
+            "qmtc2      %[rhs_val], $vf2        \n"
+            "vsub       $vf3, $vf1, $vf2        \n"
+            "qmfc2      %[result], $vf3         \n"
+            : [result] "=r"(result)
+            : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val)
+            : "memory");
         return lhs_type(result);
     }
     template <class rhs_type>
@@ -303,32 +315,33 @@ class vec_template : public base_vec {
     {
         const lhs_type& lhs = *this;
         vec128_t result;
-        asm(
-            " ### vec_l_fields * vec_r_fields ### \n"
-            "vmul_bc._mask  %0, %1, %2 \n"
-            : "=j"(result)
-            : "j"(lhs), "j"(rhs),
-            "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields),
-            "vu_mask l_fields"(lhs.valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields));
-
+        vec128_t lhs_val = lhs.vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1        \n"
+            "qmtc2      %[rhs_val], $vf2        \n"
+            "vmul       $vf3, $vf1, $vf2        \n"
+            "qmfc2      %[result], $vf3         \n"
+            : [result] "=r"(result)
+            : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val)
+            : "memory");
         return lhs_type(result);
     }
     inline lhs_type
     operator*(float rhs) const
     {
         vec128_t result;
-        asm(
-            " ### vec_l_fields * float ### \n"
-            "ctc2	rhs, $vi21 \n"
-            "vnop \n"
-            "vmuli.mask	result, this, I \n"
-            : "=j result"(result)
-            : "r rhs"(rhs),
-            "j this"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields)
-            : "vf31");
+        vec128_t this_val = this->vec128;
+        uint32_t rhs_bits = *reinterpret_cast<uint32_t*>(&rhs);
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            "ctc2       %[rhs_bits], $vi21       \n"
+            "vnop                               \n"
+            "vmuli      $vf2, $vf1, $I           \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val), [rhs_bits] "r"(rhs_bits)
+            : "memory");
         return lhs_type(result);
     }
     template <class rhs_type>
@@ -344,15 +357,18 @@ class vec_template : public base_vec {
     {
         const lhs_type& lhs = *this;
         vec128_t result;
-        asm(
-            " ### vec_l_fields / vec__r_bc ### \n"
-            "vdiv		Q, vf00w, rhs_r_bc \n"
-            "vwaitq \n"
-            "vmulq.l_fields	result, lhs, Q \n"
-            : "=j result"(result)
-            : "j lhs"(lhs), "j rhs"(rhs),
-            "vu_mask l_fields"(lhs.valid_fields),
-            "vu_bc _r_bc"(rhs.broadcast_field));
+        vec128_t lhs_val = lhs.vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1        \n"
+            "qmtc2      %[rhs_val], $vf2        \n"
+            "vdiv       $Q, $vf0w, $vf2x        \n"
+            "vwaitq                             \n"
+            "vmulq      $vf3, $vf1, $Q          \n"
+            "qmfc2      %[result], $vf3         \n"
+            : [result] "=r"(result)
+            : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val)
+            : "memory");
         return lhs_type(result);
     }
     inline lhs_type
@@ -376,15 +392,15 @@ class vec_template : public base_vec {
     operator-() const
     {
         vec128_t result;
-        asm(
-            " ### negate vec_l_fields ### \n"
-            "vsuba.mask ACC, vf00, vf00 \n"
-            "vmsubw.mask result, this, vf00w \n"
-
-            : "=j result"(result), "=r"(vu0_ACC)
-            : "j this"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields));
+        vec128_t this_val = this->vec128;
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            "vsuba      $ACC, $vf0, $vf0        \n"
+            "vmsubw     $vf2, $vf1, $vf0w       \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result), "=r"(vu0_ACC)
+            : [this_val] "r"(this_val)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -394,21 +410,21 @@ class vec_template : public base_vec {
     no_w_negate() const
     {
         vec128_t result;
-        asm(
-            " ### negate vec_l_fields (no w) ### \n"
-            "vsub.mask result, vf00, this \n"
-
-            : "=j result"(result)
-            : "j this"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields));
+        vec128_t this_val = this->vec128;
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            "vsub       $vf2, $vf0, $vf1        \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val)
+            : "memory");
         return lhs_type(result);
     }
 
     // add and multiply-add functions.
     // You can write the accumulator in one method and add or
     // madd to it in another - data dependency is respected
-    // due to the vu0_ACC global (see declaration above)�
+    // due to the vu0_ACC global (see declaration above)�
     // However, you are responsible for matching the type
     // written to and read from the accumulator - use with
     // care.
@@ -418,13 +434,31 @@ class vec_template : public base_vec {
     inline void
     to_a() const
     {
-        asm(
-            " ### ACC = vec_fields ### \n"
-            "vmulaw.mask ACC, this, vf00 \n"
-            : "=r"(vu0_ACC)
-            : "j this"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask fields"(this->valid_fields));
+        vec128_t this_val = this->vec128;
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            // Set ACC to this vector: ACC = vf1
+            // Use vmulaw: ACC = ACC + vf1 * vf0w = ACC + vf1 * 1.0
+            // To set ACC = vf1 (not add), we need ACC to be 0 first
+            // Use vmula to set ACC = vf1 * (1,1,1,1)
+            // Create (1,1,1,1) by using vf0w which is 1.0
+            // Actually, vmula sets ACC = src1 * src2 (not ACC + src1 * src2)
+            // So vmula $ACC, $vf1, $vf0 does: ACC = vf1 * vf0
+            // But vf0 is (0,0,0,1), so we'd get (0,0,0,vf1.w)
+            // Need to use vmove to broadcast vf0w or use a different approach
+            // Let's use vmulaw with zero ACC: if ACC is 0, then ACC = 0 + vf1 * 1.0 = vf1
+            // But ACC might not be 0. Let's use vmula with a vector of ones
+            // Actually, simplest: use vmove to copy vf1 to a temp, then use vmula
+            "vmove      $vf2, $vf1               \n"  // vf2 = vf1
+            "vmulaw     $ACC, $vf2, $vf0         \n"  // ACC = ACC + vf2 * vf0w = ACC + vf1 * 1.0
+            // If ACC is not zero, this won't work. Let's zero ACC first using vmula with zero
+            "vsub       $vf3, $vf0, $vf0         \n"  // vf3 = 0
+            "vmula      $ACC, $vf3, $vf3         \n"  // ACC = vf3 * vf3 = 0 * 0 = 0
+            "vmulaw     $ACC, $vf1, $vf0         \n"  // ACC = ACC + vf1 * vf0w = 0 + vf1 * 1.0 = vf1
+            "cfc2       %[acc], $vi16            \n"
+            : [acc] "=r"(vu0_ACC)
+            : [this_val] "r"(this_val)
+            : "memory");
     }
 
     // from_a (from accumulator): this = ACC
@@ -432,13 +466,15 @@ class vec_template : public base_vec {
     inline void
     from_a()
     {
-        asm(
-            " ### vec_fields = ACC ### \n"
-            "vmaddx.mask this, vf00, vf00 \n"
-            : "=j this"(*this)
-            : "r"(vu0_ACC),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask fields"(this->valid_fields));
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            // Read ACC: use vmaddw with zero to get ACC + 0 = ACC
+            "vsub       $vf1, $vf0, $vf0         \n"
+            "vmaddw     $vf1, $vf1, $vf0         \n"  // vf1 = ACC + vf1*vf0 = ACC + 0 = ACC
+            "qmfc2      %[this_val], $vf1        \n"
+            : [this_val] "+r"(this->vec128)
+            : [acc] "r"(vu0_ACC)
+            : "memory");
     }
 
     // mula (multiply, to accumulator): ACC = this * rhs
@@ -448,29 +484,32 @@ class vec_template : public base_vec {
     mula(rhs_type rhs) const
     {
         const lhs_type& lhs = *this;
-        asm(
-            " ### ACC = vec_l_fields * vec_r_fields ### \n"
-            "vmula_bc._mask ACC, lhs, rhs \n"
-            : "=r"(vu0_ACC)
-            : "j lhs"(lhs), "j rhs"(rhs),
-            "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields),
-            "vu_mask l_fields"(lhs.valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields));
+        vec128_t lhs_val = lhs.vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1        \n"
+            "qmtc2      %[rhs_val], $vf2        \n"
+            "vmula      $ACC, $vf1, $vf2        \n"
+            "cfc2       %[acc], $vi16           \n"
+            : [acc] "=r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val)
+            : "memory");
     }
 
     inline void
     mula(float rhs) const
     {
-        asm(
-            " ### ACC = vec_l_fields * float ### \n"
-            "ctc2	rhs, $vi21 \n"
-            "vnop \n"
-            "vmulai.mask ACC, lhs, I \n"
-            : "=r"(vu0_ACC)
-            : "r rhs"(rhs),
-            "j lhs"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields));
+        vec128_t lhs_val = this->vec128;
+        uint32_t rhs_bits = *reinterpret_cast<uint32_t*>(&rhs);
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1        \n"
+            "ctc2       %[rhs_bits], $vi21     \n"
+            "vnop                               \n"
+            "vmulai     $ACC, $vf1, $I          \n"
+            "cfc2       %[acc], $vi16           \n"
+            : [acc] "=r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits)
+            : "memory");
     }
 
     // aadd (accumulator add): result = ACC + this
@@ -480,15 +519,15 @@ class vec_template : public base_vec {
     aadd() const
     {
         vec128_t result;
-        asm(
-            " ### ACC + vec_l_fields ### \n"
-            "vmaddw.mask result, this, vf00 \n"
-
-            : "=j result"(result)
-            : "j this"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields),
-            "r"(vu0_ACC));
+        vec128_t this_val = this->vec128;
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[this_val], $vf1        \n"
+            "vmaddw     $vf2, $vf1, $vf0         \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val), [acc] "r"(vu0_ACC)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -496,15 +535,15 @@ class vec_template : public base_vec {
     asub() const
     {
         vec128_t result;
-        asm(
-            " ### ACC - vec_l_fields ### \n"
-            "vmsubw.mask result, this, vf00 \n"
-
-            : "=j result"(result)
-            : "j this"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields),
-            "r"(vu0_ACC));
+        vec128_t this_val = this->vec128;
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[this_val], $vf1        \n"
+            "vmsubw     $vf2, $vf1, $vf0         \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val), [acc] "r"(vu0_ACC)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -516,29 +555,32 @@ class vec_template : public base_vec {
     adda(rhs_type rhs) const
     {
         const lhs_type& lhs = *this;
-        asm(
-            " ### ACC = vec_l_fields + vec_r_fields ### \n"
-            "vadda_bc._mask ACC, lhs, rhs \n"
-            : "=r"(vu0_ACC)
-            : "j lhs"(lhs), "j rhs"(rhs),
-            "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields),
-            "vu_mask l_fields"(lhs.valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields));
+        vec128_t lhs_val = lhs.vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1        \n"
+            "qmtc2      %[rhs_val], $vf2        \n"
+            "vadda      $ACC, $vf1, $vf2        \n"
+            "cfc2       %[acc], $vi16           \n"
+            : [acc] "=r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val)
+            : "memory");
     }
 
     inline void
     adda(float rhs) const
     {
-        asm(
-            " ### ACC = vec_l_fields + float ### \n"
-            "ctc2	rhs, $vi21 \n"
-            "vnop \n"
-            "vaddai.mask ACC, lhs, I \n"
-            : "=r"(vu0_ACC)
-            : "r rhs"(rhs),
-            "j lhs"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields));
+        vec128_t lhs_val = this->vec128;
+        uint32_t rhs_bits = *reinterpret_cast<uint32_t*>(&rhs);
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1        \n"
+            "ctc2       %[rhs_bits], $vi21     \n"
+            "vnop                               \n"
+            "vaddai     $ACC, $vf1, $I          \n"
+            "cfc2       %[acc], $vi16           \n"
+            : [acc] "=r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits)
+            : "memory");
     }
 
     template <class rhs_type>
@@ -546,29 +588,32 @@ class vec_template : public base_vec {
     suba(rhs_type rhs) const
     {
         const lhs_type& lhs = *this;
-        asm(
-            " ### ACC = vec_l_fields - vec_r_fields ### \n"
-            "vsuba_bc._mask ACC, lhs, rhs \n"
-            : "=r"(vu0_ACC)
-            : "j lhs"(lhs), "j rhs"(rhs),
-            "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields),
-            "vu_mask l_fields"(lhs.valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields));
+        vec128_t lhs_val = lhs.vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1        \n"
+            "qmtc2      %[rhs_val], $vf2        \n"
+            "vsuba      $ACC, $vf1, $vf2        \n"
+            "cfc2       %[acc], $vi16           \n"
+            : [acc] "=r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val)
+            : "memory");
     }
 
     inline void
     suba(float rhs) const
     {
-        asm(
-            " ### ACC = vec_l_fields - float ### \n"
-            "ctc2	rhs, $vi21 \n"
-            "vnop \n"
-            "vsubai.mask ACC, lhs, I \n"
-            : "=r"(vu0_ACC)
-            : "r rhs"(rhs),
-            "j lhs"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields));
+        vec128_t lhs_val = this->vec128;
+        uint32_t rhs_bits = *reinterpret_cast<uint32_t*>(&rhs);
+        asm __volatile__(
+            "qmtc2      %[lhs_val], $vf1        \n"
+            "ctc2       %[rhs_bits], $vi21     \n"
+            "vnop                               \n"
+            "vsubai     $ACC, $vf1, $I          \n"
+            "cfc2       %[acc], $vi16           \n"
+            : [acc] "=r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits)
+            : "memory");
     }
 
     // aadda (accumulator add, to accumulator): ACC = ACC + this
@@ -577,15 +622,26 @@ class vec_template : public base_vec {
     inline lhs_type
     aadda() const
     {
+        // Add this vector to ACC: ACC = ACC + this
+        // Then return ACC
+        vec128_t this_val = this->vec128;
         vec128_t result;
-        asm(
-            " ### ACC = ACC + vec_l_fields ### \n"
-            "vmaddaw.mask ACC, this, vf00 \n"
-
-            : "+r"(vu0_ACC)
-            : "j this"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields));
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[this_val], $vf1        \n"
+            // First, read ACC into vf2: vf2 = ACC
+            "vsub       $vf2, $vf0, $vf0         \n"  // vf2 = 0
+            "vmaddw     $vf2, $vf2, $vf0         \n"  // vf2 = ACC + vf2 * vf0w = ACC + 0 = ACC
+            // Now add this vector: ACC = vf2 + vf1 = ACC + this
+            "vadda      $ACC, $vf2, $vf1         \n"  // ACC = vf2 + vf1 = (old ACC) + this
+            "cfc2       %[acc], $vi16            \n"
+            // Read result from ACC
+            "vsub       $vf3, $vf0, $vf0         \n"  // vf3 = 0
+            "vmaddw     $vf3, $vf3, $vf0         \n"  // vf3 = ACC + 0 = ACC
+            "qmfc2      %[result], $vf3          \n"
+            : [result] "=r"(result), [acc] "+r"(vu0_ACC)
+            : [this_val] "r"(this_val)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -593,14 +649,23 @@ class vec_template : public base_vec {
     asuba() const
     {
         vec128_t result;
-        asm(
-            " ### ACC = ACC - vec_l_fields ### \n"
-            "vmsubaw.mask ACC, this, vf00 \n"
-
-            : "+r"(vu0_ACC)
-            : "j this"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields));
+        vec128_t this_val = this->vec128;
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[this_val], $vf1        \n"
+            // First, read ACC into vf2: vf2 = ACC
+            "vsub       $vf2, $vf0, $vf0         \n"  // vf2 = 0
+            "vmaddw     $vf2, $vf2, $vf0         \n"  // vf2 = ACC + 0 = ACC
+            // Now subtract this vector: ACC = vf2 - vf1 = ACC - this
+            "vsuba      $ACC, $vf2, $vf1         \n"  // ACC = vf2 - vf1 = (old ACC) - this
+            "cfc2       %[acc], $vi16            \n"
+            // Read result from ACC
+            "vsub       $vf3, $vf0, $vf0         \n"  // vf3 = 0
+            "vmaddw     $vf3, $vf3, $vf0         \n"  // vf3 = ACC + 0 = ACC
+            "qmfc2      %[result], $vf3          \n"
+            : [result] "=r"(result), [acc] "+r"(vu0_ACC)
+            : [this_val] "r"(this_val)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -612,16 +677,17 @@ class vec_template : public base_vec {
     madd(rhs_type rhs) const
     {
         vec128_t result;
-        asm(
-            " ### ACC + vec_l_fields * vec_r_fields ### \n"
-            "vmadd_bc._mask result, this, rhs \n"
-
-            : "=j result"(result)
-            : "j this"(*this), "j rhs"(rhs),
-            "vu_binary _bc _mask"(this->fields << 8 | rhs.fields),
-            "vu_mask l_fields"(this->valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields),
-            "r"(vu0_ACC));
+        vec128_t this_val = this->vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[this_val], $vf1        \n"
+            "qmtc2      %[rhs_val], $vf2         \n"
+            "vmadd      $vf3, $vf1, $vf2         \n"
+            "qmfc2      %[result], $vf3         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val), [rhs_val] "r"(rhs_val), [acc] "r"(vu0_ACC)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -629,16 +695,18 @@ class vec_template : public base_vec {
     madd(float rhs) const
     {
         vec128_t result;
-        asm(
-            " ### ACC + vec_l_fields * float ### \n"
-            "ctc2	rhs, $vi21 \n"
-            "vnop \n"
-            "vmaddi._mask result, this, I \n"
-            : "=j result"(result)
-            : "j this"(*this), "r rhs"(rhs),
-            "vu_mask _mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields),
-            "r"(vu0_ACC));
+        vec128_t this_val = this->vec128;
+        uint32_t rhs_bits = *reinterpret_cast<uint32_t*>(&rhs);
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[this_val], $vf1        \n"
+            "ctc2       %[rhs_bits], $vi21       \n"
+            "vnop                               \n"
+            "vmaddi     $vf2, $vf1, $I           \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val), [rhs_bits] "r"(rhs_bits), [acc] "r"(vu0_ACC)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -647,16 +715,17 @@ class vec_template : public base_vec {
     msub(rhs_type rhs) const
     {
         vec128_t result;
-        asm(
-            " ### ACC - vec_l_fields * vec_r_fields ### \n"
-            "vmsub_bc._mask result, this, rhs \n"
-
-            : "=j result"(result)
-            : "j this"(*this), "j rhs"(rhs),
-            "vu_binary _bc _mask"(this->fields << 8 | rhs.fields),
-            "vu_mask l_fields"(this->valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields),
-            "r"(vu0_ACC));
+        vec128_t this_val = this->vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[this_val], $vf1        \n"
+            "qmtc2      %[rhs_val], $vf2         \n"
+            "vmsub      $vf3, $vf1, $vf2         \n"
+            "qmfc2      %[result], $vf3         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val), [rhs_val] "r"(rhs_val), [acc] "r"(vu0_ACC)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -664,16 +733,18 @@ class vec_template : public base_vec {
     msub(float rhs) const
     {
         vec128_t result;
-        asm(
-            " ### ACC - vec_l_fields * float ### \n"
-            "ctc2	rhs, $vi21 \n"
-            "vnop \n"
-            "vmsubi._mask result, this, I \n"
-            : "=j result"(result)
-            : "j this"(*this), "r rhs"(rhs),
-            "vu_mask _mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields),
-            "r"(vu0_ACC));
+        vec128_t this_val = this->vec128;
+        uint32_t rhs_bits = *reinterpret_cast<uint32_t*>(&rhs);
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[this_val], $vf1        \n"
+            "ctc2       %[rhs_bits], $vi21       \n"
+            "vnop                               \n"
+            "vmsubi     $vf2, $vf1, $I           \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val), [rhs_bits] "r"(rhs_bits), [acc] "r"(vu0_ACC)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -685,29 +756,34 @@ class vec_template : public base_vec {
     madda(rhs_type rhs) const
     {
         const lhs_type& lhs = *this;
-        asm(
-            " ### ACC = ACC + vec_l_fields * vec_r_fields ### \n"
-            "vmadda_bc._mask ACC, lhs, rhs \n"
-            : "+r"(vu0_ACC)
-            : "j lhs"(lhs), "j rhs"(rhs),
-            "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields),
-            "vu_mask l_fields"(lhs.valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields));
+        vec128_t lhs_val = lhs.vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[lhs_val], $vf1         \n"
+            "qmtc2      %[rhs_val], $vf2         \n"
+            "vmadda     $ACC, $vf1, $vf2         \n"
+            "cfc2       %[acc], $vi16            \n"
+            : [acc] "+r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val)
+            : "memory");
     }
 
     inline void
     madda(float rhs) const
     {
-        asm(
-            " ### ACC = ACC + vec_l_fields * float ### \n"
-            "ctc2	rhs, $vi21 \n"
-            "vnop \n"
-            "vmaddai.mask ACC, lhs, I \n"
-            : "+r"(vu0_ACC)
-            : "r rhs"(rhs),
-            "j lhs"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields));
+        vec128_t lhs_val = this->vec128;
+        uint32_t rhs_bits = *reinterpret_cast<uint32_t*>(&rhs);
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[lhs_val], $vf1         \n"
+            "ctc2       %[rhs_bits], $vi21       \n"
+            "vnop                               \n"
+            "vmaddai    $ACC, $vf1, $I           \n"
+            "cfc2       %[acc], $vi16            \n"
+            : [acc] "+r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits)
+            : "memory");
     }
 
     template <class rhs_type>
@@ -715,42 +791,48 @@ class vec_template : public base_vec {
     msuba(rhs_type rhs) const
     {
         const lhs_type& lhs = *this;
-        asm(
-            " ### ACC = ACC - vec_l_fields * vec_r_fields ### \n"
-            "vmsuba_bc._mask ACC, lhs, rhs \n"
-            : "+r"(vu0_ACC)
-            : "j lhs"(lhs), "j rhs"(rhs),
-            "vu_binary _bc _mask"((lhs.fields << 8) | rhs.fields),
-            "vu_mask l_fields"(lhs.valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields));
+        vec128_t lhs_val = lhs.vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[lhs_val], $vf1         \n"
+            "qmtc2      %[rhs_val], $vf2         \n"
+            "vmsuba     $ACC, $vf1, $vf2         \n"
+            "cfc2       %[acc], $vi16            \n"
+            : [acc] "+r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_val] "r"(rhs_val)
+            : "memory");
     }
 
     inline void
     msuba(float rhs) const
     {
-        asm(
-            " ### ACC = ACC - vec_l_fields * float ### \n"
-            "ctc2	rhs, $vi21 \n"
-            "vnop \n"
-            "vmsubai.mask ACC, lhs, I \n"
-            : "+r"(vu0_ACC)
-            : "r rhs"(rhs),
-            "j lhs"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask l_fields"(this->valid_fields));
+        vec128_t lhs_val = this->vec128;
+        uint32_t rhs_bits = *reinterpret_cast<uint32_t*>(&rhs);
+        asm __volatile__(
+            "ctc2       %[acc], $vi16            \n"
+            "qmtc2      %[lhs_val], $vf1         \n"
+            "ctc2       %[rhs_bits], $vi21       \n"
+            "vnop                               \n"
+            "vmsubai    $ACC, $vf1, $I           \n"
+            "cfc2       %[acc], $vi16            \n"
+            : [acc] "+r"(vu0_ACC)
+            : [lhs_val] "r"(lhs_val), [rhs_bits] "r"(rhs_bits)
+            : "memory");
     }
 
     inline lhs_type
     abs() const
     {
         vec128_t result;
-        asm(
-            " ### absolute value of vec_fields ### \n"
-            "vabs.mask result, this \n"
-            : "=j result"(result)
-            : "j this"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask fields"(this->valid_fields));
+        vec128_t this_val = this->vec128;
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            "vabs       $vf2, $vf1               \n"
+            "qmfc2      %[result], $vf2         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -758,20 +840,45 @@ class vec_template : public base_vec {
     sign() const
     {
         vec128_t result;
-        int temp;
-        int ones = 0x3f800000;
-        asm(
-            " ### signs of vec_fields ### \n"
-            "vmulx.mask result, this, vf00 \n"
-            "pextlw ones, ones, ones \n"
-            "pextlw ones, ones, ones \n"
-            "qmfc2 temp, result \n"
-            "por temp, temp, ones \n"
-            "qmtc2 temp, result \n"
-            : "=j result"(result), "=r temp"(temp), "+r ones"(ones)
-            : "j this"(*this),
-            "vu_mask mask"(this->valid_fields),
-            "vu_mask fields"(this->valid_fields));
+        vec128_t this_val = this->vec128;
+        uint32_t macflag;
+        
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            // Get absolute value
+            "vabs       $vf2, $vf1               \n"
+            // Divide original by absolute value to get sign
+            // For non-zero: sign = original / abs = ±1
+            // For zero: division by zero will set MACflag
+            "vdiv       $Q, $vf0w, $vf2x         \n"
+            "vwaitq                              \n"
+            "vmulq.x    $vf3, $vf1, $Q          \n"
+            "vdiv       $Q, $vf0w, $vf2y         \n"
+            "vwaitq                              \n"
+            "vmulq.y    $vf3, $vf1, $Q          \n"
+            "vdiv       $Q, $vf0w, $vf2z         \n"
+            "vwaitq                              \n"
+            "vmulq.z    $vf3, $vf1, $Q          \n"
+            "vdiv       $Q, $vf0w, $vf2w         \n"
+            "vwaitq                              \n"
+            "vmulq.w    $vf3, $vf1, $Q          \n"
+            // Check MACflag for each component to detect zero
+            // If division by zero occurred, set that component to 0
+            "cfc2       %[macflag], $vi17        \n"
+            "andi       %[macflag], %[macflag], 8 \n"  // Check DZ (divide by zero) flag
+            "bgtz       %[macflag], 1f           \n"
+            "nop                                 \n"
+            "qmfc2      %[result], $vf3         \n"
+            "j          2f                       \n"
+            "nop                                 \n"
+            "1:                                  \n"
+            // If zero detected, set result to zero
+            "vsub       $vf3, $vf0, $vf0        \n"
+            "qmfc2      %[result], $vf3         \n"
+            "2:                                  \n"
+            : [result] "=r"(result), [macflag] "=r"(macflag)
+            : [this_val] "r"(this_val)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -780,13 +887,16 @@ class vec_template : public base_vec {
     max(rhs_type rhs) const
     {
         vec128_t result;
-        asm(" ### maximum of vec_l_fields and vec_r_fields ### \n"
-            "vmax_bc._mask result, this, rhs \n"
-            : "=j result"(result)
-            : "j this"(*this), "j rhs"(rhs),
-            "vu_binary _bc _mask"(this->fields << 8 | rhs.fields),
-            "vu_mask l_fields"(this->valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields));
+        vec128_t this_val = this->vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            "qmtc2      %[rhs_val], $vf2          \n"
+            "vmax       $vf3, $vf1, $vf2         \n"
+            "qmfc2      %[result], $vf3         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val), [rhs_val] "r"(rhs_val)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -795,13 +905,16 @@ class vec_template : public base_vec {
     min(rhs_type rhs) const
     {
         vec128_t result;
-        asm(" ### minimum of vec_l_fields and vec_r_fields ### \n"
-            "vmini_bc._mask result, this, rhs \n"
-            : "=j result"(result)
-            : "j this"(*this), "j rhs"(rhs),
-            "vu_binary _bc _mask"(this->fields << 8 | rhs.fields),
-            "vu_mask l_fields"(this->valid_fields),
-            "vu_mask r_fields"(rhs.valid_fields));
+        vec128_t this_val = this->vec128;
+        vec128_t rhs_val = rhs.vec128;
+        asm __volatile__(
+            "qmtc2      %[this_val], $vf1        \n"
+            "qmtc2      %[rhs_val], $vf2          \n"
+            "vmini      $vf3, $vf1, $vf2         \n"
+            "qmfc2      %[result], $vf3         \n"
+            : [result] "=r"(result)
+            : [this_val] "r"(this_val), [rhs_val] "r"(rhs_val)
+            : "memory");
         return lhs_type(result);
     }
 
@@ -819,7 +932,7 @@ class vec_template : public base_vec {
             "qfsrv	%4, $0, %4 	# temp0 >>= 8 \n"
             "mtc1	%4, %3		# w = value.w \n"
             : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=r"(temp0)
-            : "r"(vec128));
+            : "r"(this->vec128));
 
         printf("(%f %f %f %f)\n", x, y, z, w);
     }
diff --git a/include/ps2s/vu.h b/include/ps2s/vu.h
index 460d4c9..64f7ff5 100644
--- a/include/ps2s/vu.h
+++ b/include/ps2s/vu.h
@@ -9,6 +9,7 @@
 
 #include "ps2s/debug.h"
 #include "ps2s/types.h"
+#include "ps2s/core.h"
 
 /********************************************
 	 * common
@@ -43,20 +44,25 @@ void VU0::CopyQwordsToVU0(uint32_t vu0QwordOffset, uint128_t* mainMemSrc, uint32
 {
     mAssert(((uint32_t)mainMemSrc & 0xf) == 0);
 
-    asm volatile("
-                         .set noreorder
-                             ctc2
-                     % 2,
-                 $vi03
-                     sll $8,
-                 % 0, 4 /* num bytes */
-                 addu $8,
-                 % 1, $8 /* ending address */
-                          .align 8 0
-                 : lqc2 vf02, 0(% 1) addiu % 1, % 1, 16 bne % 1, $8, 0b vsqi vf02, ($vi03++).set reorder "
-                 : "+r"(numQwords), "+r"(mainMemSrc)
-                 : "r"(vu0QwordOffset)
-                 : "cc", "$8");
+    // VU0 memory is mapped starting at VU0Code
+    // Each qword is 16 bytes, so address = VU0Code + (vu0QwordOffset * 16)
+    uint32_t vu0_addr = Core::MemMappings::VU0Code + (vu0QwordOffset << 4);
+
+    asm __volatile__(
+        "addu      $8, %0, $zero   \n"
+        "sll       $9, %1, 4       \n"
+        "addu      $10, $8, $9     \n"
+        "addu      $11, %2, $zero  \n"
+        "0:                        \n"
+        "lqc2      $vf2, 0($8)     \n"
+        "addiu     $8, $8, 16      \n"
+        "sqc2      $vf2, 0($11)    \n"
+        "addiu     $11, $11, 16    \n"
+        "bne       $8, $10, 0b     \n"
+        "nop                       \n"
+        :
+        : "r"(mainMemSrc), "r"(numQwords), "r"(vu0_addr)
+        : "memory", "$8", "$9", "$10", "$11");
 }
 
 void VU0::CopyEvenQwordsToVU0(uint32_t vu0QwordOffset, uint128_t* mainMemSrc, uint32_t numQwords)
@@ -64,47 +70,54 @@ void VU0::CopyEvenQwordsToVU0(uint32_t vu0QwordOffset, uint128_t* mainMemSrc, ui
     mAssert(((uint32_t)mainMemSrc & 0xf) == 0);
     mErrorIf(numQwords & 1, "numQwords must be EVEN!");
 
-    asm volatile("
-                         .set noreorder
-
-                             ctc2
-                     % 2,
-                 $vi04
-                     sll $8,
-                 % 0, 4 /* num bytes */
-                 addu $8,
-                 % 1, $8 /* ending address */
-                          .balign 8 0
-                 : lqc2 vf02, 0(% 1) addiu % 1, % 1, 16 lqc2 vf03, 0(% 1) addiu % 1, % 1, 16 vsqi vf02, ($vi04++)nop bne % 1, $8, 0b vsqi vf03, ($vi04++)
-
-                                                                                                                                                    .set reorder "
-                 : "+r"(numQwords), "+r"(mainMemSrc)
-                 : "r"(vu0QwordOffset)
-                 : "cc", "$8");
+    // VU0 memory is mapped starting at VU0Code
+    // Each qword is 16 bytes, so address = VU0Code + (vu0QwordOffset * 16)
+    uint32_t vu0_addr = Core::MemMappings::VU0Code + (vu0QwordOffset << 4);
+
+    asm __volatile__(
+        "addu      $8, %0, $zero   \n"
+        "sll       $9, %1, 4       \n"
+        "addu      $10, $8, $9     \n"
+        "addu      $11, %2, $zero  \n"
+        "0:                        \n"
+        "lqc2      $vf2, 0($8)     \n"
+        "addiu     $8, $8, 16      \n"
+        "lqc2      $vf3, 0($8)     \n"
+        "addiu     $8, $8, 16      \n"
+        "sqc2      $vf2, 0($11)    \n"
+        "addiu     $11, $11, 16    \n"
+        "sqc2      $vf3, 0($11)    \n"
+        "addiu     $11, $11, 16    \n"
+        "bne       $8, $10, 0b     \n"
+        "nop                       \n"
+        :
+        : "r"(mainMemSrc), "r"(numQwords), "r"(vu0_addr)
+        : "memory", "$8", "$9", "$10", "$11");
 }
 
 void VU0::CopyQwordsFromVU0(uint128_t* mainMemDest, uint32_t vu0QwordOffset, uint32_t numQwords)
 {
     mAssert(((uint32_t)mainMemDest & 0xf) == 0);
 
-    asm volatile("
-                         .set noreorder
-
-                             ctc2
-                     % 2,
-                 $vi02
-                         addi
-                     % 0,
-                 % 0, -16 sll $8, % 1, 4 /* num bytes */
-                 addu $8,
-                 $8, % 0 /* ending address */
-                           .balign 8 0
-                 : vlqi vf01, ($vi02++)addiu % 0, % 0, 16 bne % 0, $8, 0b sqc2 vf01, 0(% 0)
-
-                                                                                         .set reorder "
-                 : "+r"(mainMemDest)
-                 : "r"(numQwords), "r"(vu0QwordOffset)
-                 : "$8", "cc", "memory");
+    // VU0 memory is mapped starting at VU0Code
+    // Each qword is 16 bytes, so address = VU0Code + (vu0QwordOffset * 16)
+    uint32_t vu0_addr = Core::MemMappings::VU0Code + (vu0QwordOffset << 4);
+
+    asm __volatile__(
+        "addu      $8, %0, $zero   \n"
+        "sll       $9, %1, 4       \n"
+        "addu      $10, $8, $9     \n"
+        "addu      $11, %2, $zero  \n"
+        "0:                        \n"
+        "lqc2      $vf1, 0($11)    \n"
+        "addiu     $11, $11, 16    \n"
+        "sqc2      $vf1, 0($8)     \n"
+        "addiu     $8, $8, 16      \n"
+        "bne       $8, $10, 0b     \n"
+        "nop                       \n"
+        :
+        : "r"(mainMemDest), "r"(numQwords), "r"(vu0_addr)
+        : "memory", "$8", "$9", "$10", "$11");
 }
 
 void VU0::CopyEvenQwordsFromVU0(uint128_t* mainMemDest, uint32_t vu0QwordOffset, uint32_t numQwords)
@@ -112,40 +125,29 @@ void VU0::CopyEvenQwordsFromVU0(uint128_t* mainMemDest, uint32_t vu0QwordOffset,
     mAssert(((uint32_t)mainMemDest & 0xf) == 0);
     mErrorIf(numQwords & 1, "numQwords must be EVEN!");
 
-    asm volatile("
-                         .set noreorder
-
-                             ctc2
-                     % 2,
-                 $vi02
-                         addi
-                     % 0,
-                 % 0, -16 sll $8, % 1, 4 /* num bytes */
-                 addu $8,
-                 $8, % 0 /* ending address */
-                           .balign 8 0
-                 : addiu % 0, % 0, 16 vlqi vf01, ($vi02++)
-
-                                                     vlqi vf02,
-                 ($vi02++)
-                     nop
-
-                         sqc2 vf01,
-                 0(% 0)
-                         nop
-
-                             addu
-                     % 0,
-                 % 0, 16 nop
-
-                          bne
-                     % 0,
-                 $8, 0b sqc2 vf02, 0(% 0)
-
-                                       .set reorder "
-                 : "+r"(mainMemDest)
-                 : "r"(numQwords), "r"(vu0QwordOffset)
-                 : "$8", "cc", "memory");
+    // VU0 memory is mapped starting at VU0Code
+    // Each qword is 16 bytes, so address = VU0Code + (vu0QwordOffset * 16)
+    uint32_t vu0_addr = Core::MemMappings::VU0Code + (vu0QwordOffset << 4);
+
+    asm __volatile__(
+        "addu      $8, %0, $zero   \n"
+        "sll       $9, %1, 4       \n"
+        "addu      $10, $8, $9     \n"
+        "addu      $11, %2, $zero  \n"
+        "0:                        \n"
+        "lqc2      $vf1, 0($11)    \n"
+        "addiu     $11, $11, 16    \n"
+        "sqc2      $vf1, 0($8)     \n"
+        "addiu     $8, $8, 16      \n"
+        "lqc2      $vf2, 0($11)    \n"
+        "addiu     $11, $11, 16    \n"
+        "sqc2      $vf2, 0($8)     \n"
+        "addiu     $8, $8, 16      \n"
+        "bne       $8, $10, 0b     \n"
+        "nop                       \n"
+        :
+        : "r"(mainMemDest), "r"(numQwords), "r"(vu0_addr)
+        : "memory", "$8", "$9", "$10", "$11");
 }
 
 #endif // ps2s_vu_h
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7c13456..bf32b4b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,4 +1,55 @@
 # Tests for ps2stuff
-# TODO: Add test executables here when test sources are available
 
-message(STATUS "Tests directory configured (no tests defined yet)")
+# VU0 path: test_shared.cpp compiled with VU0 assembly (default)
+add_executable(test_vu0.elf
+	test_shared.cpp
+)
+
+target_include_directories(test_vu0.elf PRIVATE
+	${CMAKE_SOURCE_DIR}/include
+)
+
+target_link_libraries(test_vu0.elf
+	ps2stuff
+	m
+)
+
+find_program(PS2CLIENT_EXECUTABLE ps2client)
+if(PS2CLIENT_EXECUTABLE)
+	add_custom_target(run_test_vu0
+		COMMAND ${PS2CLIENT_EXECUTABLE} execee host:test_vu0.elf
+		DEPENDS test_vu0.elf
+		WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+		COMMENT "Running test_vu0.elf on PS2 via ps2client"
+	)
+endif()
+
+message(STATUS "Configured test target: test_vu0.elf")
+
+add_executable(test_cpu_shared.elf
+	test_shared.cpp
+)
+
+target_include_directories(test_cpu_shared.elf PRIVATE
+	${CMAKE_SOURCE_DIR}/include
+)
+
+target_compile_definitions(test_cpu_shared.elf PRIVATE
+	USE_CPU_COMPAT
+)
+
+target_link_libraries(test_cpu_shared.elf
+	ps2stuff
+	m
+)
+
+if(PS2CLIENT_EXECUTABLE)
+	add_custom_target(run_test_cpu_shared
+		COMMAND ${PS2CLIENT_EXECUTABLE} execee host:test_cpu_shared.elf
+		DEPENDS test_cpu_shared.elf
+		WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+		COMMENT "Running test_cpu_shared.elf on PS2 via ps2client"
+	)
+endif()
+
+message(STATUS "Configured test target: test_cpu_shared.elf")
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000..536a004
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,85 @@
+# VU0 Unit Tests
+
+This directory contains unit tests for the VU0 functions that were converted from custom GCC extensions to standard GCC inline assembly.
+
+## Building
+
+Configure and build from the repository root with tests enabled:
+
+```bash
+cmake -S . -B build -DBUILD_TESTS=ON
+cmake --build build -j
+```
+
+## Running
+
+To run the tests on a PS2 via ps2client:
+
+```bash
+cd build/tests
+ps2client execee host:test_vu0.elf
+```
+
+Or manually:
+
+```bash
+/Applications/PCSX2.app/Contents/MacOS/PCSX2 -elf $(pwd)/test_vu0.elf
+```
+
+Note on emulation differences:
+
+- Some PS2 VU floating-point corner cases are not emulated identically by
+	popular emulators (for example, specific VMUL cases where `1 * X` may not
+	equal `X` on real hardware). If tests behave differently under PCSX2 vs
+	real hardware, see: https://fobes.dev/ps2/detecting-emu-vu-floats and
+	consider running the affected tests on real hardware or making assertions
+	tolerant to small, platform-dependent float differences.
+
+## Test Coverage
+
+The tests cover the following VU0 functions:
+
+### Vector Operations (vec_xyz)
+- Addition, subtraction, multiplication, division
+- Scalar multiplication
+- Negation
+- Absolute value
+- Max/min operations
+- Normalization
+- Set operations (set_zero, set, set_vec)
+- Compound assignment operators (+=, -=, *=)
+- Dot product
+- Length and length squared
+- Cross product
+
+### Vector Operations (vec_xyzw)
+- Addition, subtraction
+- Scalar multiplication
+
+### Matrix Operations (mat_33)
+- set_zero
+- set_row/get_row
+- Matrix-vector multiplication
+
+### Matrix Operations (mat_44)
+- set_zero
+- set_row/get_row
+- Matrix-vector multiplication
+
+### Accumulator Operations
+- Basic accumulator function tests
+
+## Test Framework
+
+The test framework uses simple macros:
+- `ASSERT_EQ(a, b)` - Asserts two floats are equal within EPSILON
+- `ASSERT_VEC3_EQ(v, x, y, z)` - Asserts a vec_xyz matches expected values
+- `ASSERT_VEC_EQ(v, x, y, z, w)` - Asserts a vec_xyzw matches expected values
+- `RUN_TEST(func)` - Runs a test function and reports results
+
+## Notes
+
+- EPSILON is set to 1e-5f for floating-point comparisons
+- Tests use `vec_x(v)`, `vec_y(v)`, etc. to extract vector components
+- Some functions in vector.h still use "j" constraints and may need conversion if you encounter issues
+
diff --git a/tests/test_shared.cpp b/tests/test_shared.cpp
new file mode 100644
index 0000000..7303b8a
--- /dev/null
+++ b/tests/test_shared.cpp
@@ -0,0 +1,2546 @@
+/*	  Copyright (C) 2000,2001,2002  Sony Computer Entertainment America
+ *
+ *       	  This file is subject to the terms and conditions of the GNU Lesser
+ *	  General Public License Version 2.1. See the file "COPYING" in the
+ *	  main directory of this archive for more details.                             */
+
+#include <stdio.h>
+#include <math.h>
+#ifdef USE_CPU_COMPAT
+#include <ps2s/cpu_compat_types.h>
+#define NO_VU0_VECTORS
+#include <ps2s/cpu_vector.h>
+#include <ps2s/cpu_matrix.h>
+#undef NO_VU0_VECTORS
+#else
+#include <ps2s/vector.h>
+#include <ps2s/matrix.h>
+#include <ps2s/vu.h>
+#include <ps2s/ps2stuff.h>
+// Also exercise the cpu_vec_4 / cpu_mat_44 VU0-asm fast paths.
+#include <ps2s/cpu_vector.h>
+#include <ps2s/cpu_matrix.h>
+#endif
+#include <kernel.h>
+#include <string.h>
+
+// Test framework macros
+#define EPSILON 1e-5f
+#define ASSERT_EQ(a, b) \
+    do { \
+        if (fabsf((a) - (b)) > EPSILON) { \
+            printf("FAIL: %s:%d - Expected %.6f, got %.6f\n", __FILE__, __LINE__, (float)(b), (float)(a)); \
+            return 1; \
+        } \
+    } while(0)
+
+#define ASSERT_VEC_EQ(v, x, y, z, w) \
+    do { \
+        float vx = (float)vec_x(v); \
+        float vy = (float)vec_y(v); \
+        float vz = (float)vec_z(v); \
+        float vw = (float)vec_w(v); \
+        if (fabsf(vx - (x)) > EPSILON || \
+            fabsf(vy - (y)) > EPSILON || \
+            fabsf(vz - (z)) > EPSILON || \
+            fabsf(vw - (w)) > EPSILON) { \
+            printf("FAIL: %s:%d - Expected (%.6f, %.6f, %.6f, %.6f), got (%.6f, %.6f, %.6f, %.6f)\n", \
+                   __FILE__, __LINE__, (float)(x), (float)(y), (float)(z), (float)(w), \
+                   vx, vy, vz, vw); \
+            return 1; \
+        } \
+    } while(0)
+
+#define ASSERT_VEC3_EQ(v, x, y, z) \
+    do { \
+        float vx = (float)vec_x(v); \
+        float vy = (float)vec_y(v); \
+        float vz = (float)vec_z(v); \
+        if (fabsf(vx - (x)) > EPSILON || \
+            fabsf(vy - (y)) > EPSILON || \
+            fabsf(vz - (z)) > EPSILON) { \
+            printf("FAIL: %s:%d - Expected (%.6f, %.6f, %.6f), got (%.6f, %.6f, %.6f)\n", \
+                   __FILE__, __LINE__, (float)(x), (float)(y), (float)(z), \
+                   vx, vy, vz); \
+            return 1; \
+        } \
+    } while(0)
+
+// CPU legacy type macros (cpu_vec_3, cpu_vec_4 have direct .x/.y/.z/.w members)
+#ifdef USE_CPU_COMPAT
+#define ASSERT_FLOAT_EQ(a, b) \
+    do { \
+        if (fabsf((float)(a) - (float)(b)) > EPSILON) { \
+            printf("FAIL: %s:%d - Expected %.6f, got %.6f\n", __FILE__, __LINE__, (float)(b), (float)(a)); \
+            return 1; \
+        } \
+    } while(0)
+#define ASSERT_CPU_VEC3_EQ(v, ex, ey, ez) \
+    do { \
+        if (fabsf((v).x - (ex)) > EPSILON || \
+            fabsf((v).y - (ey)) > EPSILON || \
+            fabsf((v).z - (ez)) > EPSILON) { \
+            printf("FAIL: %s:%d - Expected (%.6f, %.6f, %.6f), got (%.6f, %.6f, %.6f)\n", \
+                   __FILE__, __LINE__, (float)(ex), (float)(ey), (float)(ez), \
+                   (v).x, (v).y, (v).z); \
+            return 1; \
+        } \
+    } while(0)
+#define ASSERT_CPU_VEC4_EQ(v, ex, ey, ez, ew) \
+    do { \
+        if (fabsf((v).x - (ex)) > EPSILON || \
+            fabsf((v).y - (ey)) > EPSILON || \
+            fabsf((v).z - (ez)) > EPSILON || \
+            fabsf((v).w - (ew)) > EPSILON) { \
+            printf("FAIL: %s:%d - Expected (%.6f, %.6f, %.6f, %.6f), got (%.6f, %.6f, %.6f, %.6f)\n", \
+                   __FILE__, __LINE__, \
+                   (float)(ex), (float)(ey), (float)(ez), (float)(ew), \
+                   (v).x, (v).y, (v).z, (v).w); \
+            return 1; \
+        } \
+    } while(0)
+#endif // USE_CPU_COMPAT
+
+// Assertions for cpu_vec_4 .x/.y/.z/.w (available in both builds — needed
+// by the VU0 build to test the new VU0-asm fast paths in cpu_vector.h /
+// cpu_matrix.h).
+#ifndef USE_CPU_COMPAT
+#define ASSERT_FLOAT_EQ(a, b) \
+    do { \
+        if (fabsf((float)(a) - (float)(b)) > EPSILON) { \
+            printf("FAIL: %s:%d - Expected %.6f, got %.6f\n", __FILE__, __LINE__, (float)(b), (float)(a)); \
+            return 1; \
+        } \
+    } while(0)
+#define ASSERT_CPU_VEC4_EQ(v, ex, ey, ez, ew) \
+    do { \
+        if (fabsf((v).x - (ex)) > EPSILON || \
+            fabsf((v).y - (ey)) > EPSILON || \
+            fabsf((v).z - (ez)) > EPSILON || \
+            fabsf((v).w - (ew)) > EPSILON) { \
+            printf("FAIL: %s:%d - Expected (%.6f, %.6f, %.6f, %.6f), got (%.6f, %.6f, %.6f, %.6f)\n", \
+                   __FILE__, __LINE__, \
+                   (float)(ex), (float)(ey), (float)(ez), (float)(ew), \
+                   (v).x, (v).y, (v).z, (v).w); \
+            return 1; \
+        } \
+    } while(0)
+#endif // !USE_CPU_COMPAT
+
+static int test_count = 0;
+static const char* failed_tests[200];
+static int failed_count = 0;
+
+#define RUN_TEST(test_func) \
+    do { \
+        test_count++; \
+        printf("Running %s...\n", #test_func); \
+        int result = test_func(); \
+        if (result != 0) { \
+            printf("TEST FAILED: %s\n\n", #test_func); \
+            if (failed_count < 200) { \
+                failed_tests[failed_count++] = #test_func; \
+            } \
+        } else { \
+            printf("PASS: %s\n\n", #test_func); \
+        } \
+    } while(0)
+
+// Emulator detection helper
+// Returns 1 if the VU multiply-1 bug (1 * X != X) is present (i.e. hardware),
+// or 0 if the environment behaves like common emulators.
+int isVUMulErrorPresent()
+{
+    float in = 129.5f;
+    float out = 0.0f;
+    asm __volatile__(
+        "QMTC2 %1, $vf1\n\t"          // Set VF1 to 129.5f
+        "VADDw $vf2, $vf0, $vf0w\n\t" // VF2 = vf0[w] = 1
+        "VMUL $vf1, $vf2, $vf1\n\t"   // VF1 = 1 * 129.5f
+        "QMFC2 %0, $vf1\n\t"          // Load result back
+        : "=r"(out)
+        : "r"(in)
+        : "memory");
+    return (in != out);
+}
+
+// Global flag set at test startup
+int g_vu_mul_emulator_bug = -1;
+
+// Test functions
+
+int test_vec3_addition()
+{
+    vec_xyz v1(1.0f, 2.0f, 3.0f);
+    vec_xyz v2(4.0f, 5.0f, 6.0f);
+    vec_xyz result = v1 + v2;
+    ASSERT_VEC3_EQ(result, 5.0f, 7.0f, 9.0f);
+    return 0;
+}
+
+int test_vec3_subtraction()
+{
+    vec_xyz v1(5.0f, 7.0f, 9.0f);
+    vec_xyz v2(1.0f, 2.0f, 3.0f);
+    vec_xyz result = v1 - v2;
+    ASSERT_VEC3_EQ(result, 4.0f, 5.0f, 6.0f);
+    return 0;
+}
+
+int test_vec3_multiplication()
+{
+    vec_xyz v1(2.0f, 3.0f, 4.0f);
+    vec_xyz v2(2.0f, 2.0f, 2.0f);
+    vec_xyz result = v1 * v2;
+    ASSERT_VEC3_EQ(result, 4.0f, 6.0f, 8.0f);
+    return 0;
+}
+
+int test_vec3_scalar_multiplication()
+{
+    vec_xyz v1(2.0f, 3.0f, 4.0f);
+    vec_xyz result = v1 * 2.0f;
+    ASSERT_VEC3_EQ(result, 4.0f, 6.0f, 8.0f);
+    return 0;
+}
+
+int test_vec3_division()
+{
+    vec_xyz v1(8.0f, 6.0f, 4.0f);
+    vec_xyz v2(2.0f, 2.0f, 2.0f);
+    vec_xyz result = v1 / v2;
+    ASSERT_VEC3_EQ(result, 4.0f, 3.0f, 2.0f);
+    return 0;
+}
+
+int test_vec3_negate()
+{
+    vec_xyz v1(1.0f, 2.0f, 3.0f);
+    vec_xyz result = -v1;
+    ASSERT_VEC3_EQ(result, -1.0f, -2.0f, -3.0f);
+    return 0;
+}
+
+int test_vec3_abs()
+{
+    vec_xyz v1(-1.0f, 2.0f, -3.0f);
+    vec_xyz result = v1.abs();
+    ASSERT_VEC3_EQ(result, 1.0f, 2.0f, 3.0f);
+    return 0;
+}
+
+int test_vec3_max()
+{
+    vec_xyz v1(1.0f, 5.0f, 3.0f);
+    vec_xyz v2(4.0f, 2.0f, 6.0f);
+    vec_xyz result = v1.max(v2);
+    ASSERT_VEC3_EQ(result, 4.0f, 5.0f, 6.0f);
+    return 0;
+}
+
+int test_vec3_min()
+{
+    vec_xyz v1(1.0f, 5.0f, 3.0f);
+    vec_xyz v2(4.0f, 2.0f, 6.0f);
+    vec_xyz result = v1.min(v2);
+    ASSERT_VEC3_EQ(result, 1.0f, 2.0f, 3.0f);
+    return 0;
+}
+
+int test_vec3_normalized()
+{
+    vec_xyz v1(3.0f, 4.0f, 0.0f);  // Length = 5
+    vec_xyz result = v1.normalized();
+    float rx = (float)vec_x(result);
+    float ry = (float)vec_y(result);
+    float rz = (float)vec_z(result);
+    float len = sqrtf(rx * rx + ry * ry + rz * rz);
+    ASSERT_EQ(len, 1.0f);
+    ASSERT_VEC3_EQ(result, 0.6f, 0.8f, 0.0f);
+    return 0;
+}
+
+int test_vec3_set_zero()
+{
+    vec_xyz v1(1.0f, 2.0f, 3.0f);
+    v1.set_zero();
+    ASSERT_VEC3_EQ(v1, 0.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+int test_vec3_set()
+{
+    vec_xyz v1;
+    v1.set(5.0f);
+    ASSERT_VEC3_EQ(v1, 5.0f, 5.0f, 5.0f);
+    return 0;
+}
+
+int test_vec3_set_vec()
+{
+    vec_xyz v1(1.0f, 2.0f, 3.0f);
+    vec_xyz v2;
+    v2.set(v1);
+    ASSERT_VEC3_EQ(v2, 1.0f, 2.0f, 3.0f);
+    return 0;
+}
+
+int test_vec4_addition()
+{
+    vec_xyzw v1(1.0f, 2.0f, 3.0f, 4.0f);
+    vec_xyzw v2(5.0f, 6.0f, 7.0f, 8.0f);
+    vec_xyzw result = v1 + v2;
+    ASSERT_VEC_EQ(result, 6.0f, 8.0f, 10.0f, 12.0f);
+    return 0;
+}
+
+int test_vec4_subtraction()
+{
+    vec_xyzw v1(10.0f, 9.0f, 8.0f, 7.0f);
+    vec_xyzw v2(1.0f, 2.0f, 3.0f, 4.0f);
+    vec_xyzw result = v1 - v2;
+    ASSERT_VEC_EQ(result, 9.0f, 7.0f, 5.0f, 3.0f);
+    return 0;
+}
+
+int test_vec4_scalar_multiplication()
+{
+    vec_xyzw v1(1.0f, 2.0f, 3.0f, 4.0f);
+    vec_xyzw result = v1 * 3.0f;
+    ASSERT_VEC_EQ(result, 3.0f, 6.0f, 9.0f, 12.0f);
+    return 0;
+}
+
+int test_mat33_set_zero()
+{
+    mat_33 m;
+    m.set_zero();
+    vec_xyz col0 = m.get_col0();
+    vec_xyz col1 = m.get_col1();
+    vec_xyz col2 = m.get_col2();
+    ASSERT_VEC3_EQ(col0, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(col1, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(col2, 0.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+int test_mat33_set_row()
+{
+    mat_33 m;
+    m.set_zero();
+    vec_xyz row0(1.0f, 2.0f, 3.0f);
+    m.set_row0(row0);
+    vec_xyz col0 = m.get_col0();
+    vec_xyz col1 = m.get_col1();
+    vec_xyz col2 = m.get_col2();
+    ASSERT_VEC3_EQ(col0, 1.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(col1, 2.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(col2, 3.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+int test_mat33_get_row()
+{
+    mat_33 m;
+    m.set_zero();
+    vec_xyz row0(1.0f, 2.0f, 3.0f);
+    m.set_row0(row0);
+    vec_xyz retrieved_row0 = m.get_row0();
+    ASSERT_VEC3_EQ(retrieved_row0, 1.0f, 2.0f, 3.0f);
+    return 0;
+}
+
+int test_mat33_multiply_vector()
+{
+    mat_33 m;
+    m.set_zero();
+    m.set_row0(vec_xyz(1.0f, 0.0f, 0.0f));
+    m.set_row1(vec_xyz(0.0f, 2.0f, 0.0f));
+    m.set_row2(vec_xyz(0.0f, 0.0f, 3.0f));
+    
+    vec_xyz v(1.0f, 2.0f, 3.0f);
+    vec_xyz result = m * v;
+    ASSERT_VEC3_EQ(result, 1.0f, 4.0f, 9.0f);
+    return 0;
+}
+
+int test_mat44_set_zero()
+{
+    mat_44 m;
+    m.set_zero();
+    vec_xyzw col0 = m.get_col0();
+    vec_xyzw col1 = m.get_col1();
+    vec_xyzw col2 = m.get_col2();
+    vec_xyzw col3 = m.get_col3();
+    ASSERT_VEC_EQ(col0, 0.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(col1, 0.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(col2, 0.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(col3, 0.0f, 0.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+int test_mat44_set_row()
+{
+    mat_44 m;
+    m.set_zero();
+    vec_xyzw row0(1.0f, 2.0f, 3.0f, 4.0f);
+    m.set_row0(row0);
+    vec_xyzw col0 = m.get_col0();
+    vec_xyzw col1 = m.get_col1();
+    vec_xyzw col2 = m.get_col2();
+    vec_xyzw col3 = m.get_col3();
+    ASSERT_VEC_EQ(col0, 1.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(col1, 2.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(col2, 3.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(col3, 4.0f, 0.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+int test_mat44_multiply_vector()
+{
+    mat_44 m;
+    m.set_zero();
+    m.set_row0(vec_xyzw(1.0f, 0.0f, 0.0f, 0.0f));
+    m.set_row1(vec_xyzw(0.0f, 2.0f, 0.0f, 0.0f));
+    m.set_row2(vec_xyzw(0.0f, 0.0f, 3.0f, 0.0f));
+    m.set_row3(vec_xyzw(0.0f, 0.0f, 0.0f, 4.0f));
+    
+    vec_xyzw v(1.0f, 2.0f, 3.0f, 4.0f);
+    vec_xyzw result = m * v;
+    ASSERT_VEC_EQ(result, 1.0f, 4.0f, 9.0f, 16.0f);
+    return 0;
+}
+
+#ifndef USE_CPU_COMPAT
+int test_accumulator_operations()
+{
+    vec_xyz v1(2.0f, 3.0f, 4.0f);
+    vec_xyz v2(1.0f, 1.0f, 1.0f);
+    
+    // Test to_a
+    v1.to_a();
+    
+    // Test mula
+    v2.mula(v2);
+    
+    // Test aadd
+    vec_xyz result = v1.aadd();
+    // The accumulator should contain v1, so aadd should add v1 to itself
+    // This is a basic test to ensure the accumulator functions compile and run
+    return 0;
+}
+#endif // !USE_CPU_COMPAT
+
+// Test normalized with zero vector (branch)
+int test_vec3_normalized_zero()
+{
+    vec_xyz v(0.0f, 0.0f, 0.0f);
+    vec_xyz result = v.normalized();
+    ASSERT_VEC3_EQ(result, 0.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+// Test truncate_length branch (vector shorter than max)
+int test_vec3_truncate_length_shorter()
+{
+    vec_xyz v(1.0f, 2.0f, 0.0f);  // Length = sqrt(5) ≈ 2.236
+    vec_xyz result = v.truncate_length(5.0f);
+    ASSERT_VEC3_EQ(result, 1.0f, 2.0f, 0.0f);  // Should return unchanged
+    return 0;
+}
+
+// Test truncate_length branch (vector longer than max)
+int test_vec3_truncate_length_longer()
+{
+    vec_xyz v(3.0f, 4.0f, 0.0f);  // Length = 5
+    vec_xyz result = v.truncate_length(3.0f);  // Max length = 3
+    float len = result.length();
+    ASSERT_EQ(len, 3.0f);
+    return 0;
+}
+
+// Test is_zero
+int test_vec3_is_zero()
+{
+    vec_xyz v1(0.0f, 0.0f, 0.0f);
+    vec_xyz v2(1.0f, 0.0f, 0.0f);
+    int zero1 = v1.is_zero();
+    int zero2 = v2.is_zero();
+    if (zero1 != 1) return 1;
+    if (zero2 != 0) return 1;
+    return 0;
+}
+
+// Test sign function
+int test_vec3_sign()
+{
+    vec_xyz v1(5.0f, -3.0f, 0.0f);
+    vec_xyz result = v1.sign();
+    float rx = (float)vec_x(result);
+    float ry = (float)vec_y(result);
+    float rz = (float)vec_z(result);
+    if (rx < 0.9f || rx > 1.1f) return 1;  // Should be ~1.0
+    if (ry > -0.9f || ry < -1.1f) return 1;  // Should be ~-1.0
+    if (fabsf(rz) > 0.1f) return 1;  // Should be ~0.0
+    return 0;
+}
+
+// Test one_over - commented out due to incompatible assembly constraints ("j" constraint)
+int test_vec3_one_over()
+{
+    vec_xyz v(2.0f, 4.0f, 8.0f);
+    vec_xyz result = v.one_over();
+    float rx = (float)vec_x(result);
+    float ry = (float)vec_y(result);
+    float rz = (float)vec_z(result);
+    ASSERT_EQ(rx, 0.5f);
+    ASSERT_EQ(ry, 0.25f);
+    ASSERT_EQ(rz, 0.125f);
+    return 0;
+}
+
+// Test interpolate
+int test_vec3_interpolate()
+{
+    vec_xyz v1(0.0f, 0.0f, 0.0f);
+    vec_xyz v2(10.0f, 20.0f, 30.0f);
+    vec_xyz result = v1.interpolate(0.5f, v2);
+    ASSERT_VEC3_EQ(result, 5.0f, 10.0f, 15.0f);
+    return 0;
+}
+
+// Test distance_from
+int test_vec3_distance_from()
+{
+    vec_xyz v1(0.0f, 0.0f, 0.0f);
+    vec_xyz v2(3.0f, 4.0f, 0.0f);
+    float dist = (float)v1.distance_from(v2);
+    ASSERT_EQ(dist, 5.0f);
+    return 0;
+}
+
+// Test set_length
+int test_vec3_set_length()
+{
+    vec_xyz v(3.0f, 4.0f, 0.0f);  // Length = 5
+    vec_xyz result = v.set_length(10.0f);  // Set to length 10
+    float len = result.length();
+    ASSERT_EQ(len, 10.0f);
+    ASSERT_VEC3_EQ(result, 6.0f, 8.0f, 0.0f);  // Scaled by 2
+    return 0;
+}
+
+// Test parallel_component
+int test_vec3_parallel_component()
+{
+    vec_xyz v(3.0f, 4.0f, 0.0f);
+    vec_xyz basis(1.0f, 0.0f, 0.0f);  // Unit vector in x direction
+    vec_xyz result = v.parallel_component(basis);
+    ASSERT_VEC3_EQ(result, 3.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+// Test perpendicular_component
+int test_vec3_perpendicular_component()
+{
+    vec_xyz v(3.0f, 4.0f, 0.0f);
+    vec_xyz basis(1.0f, 0.0f, 0.0f);  // Unit vector in x direction
+    vec_xyz result = v.perpendicular_component(basis);
+    ASSERT_VEC3_EQ(result, 0.0f, 4.0f, 0.0f);
+    return 0;
+}
+
+// Test normalize (in-place)
+int test_vec3_normalize()
+{
+    vec_xyz v(3.0f, 4.0f, 0.0f);
+    v.normalize();
+    float len = v.length();
+    ASSERT_EQ(len, 1.0f);
+    ASSERT_VEC3_EQ(v, 0.6f, 0.8f, 0.0f);
+    return 0;
+}
+
+#ifndef USE_CPU_COMPAT
+// Test accumulator: to_a and from_a
+int test_vec3_to_a_from_a()
+{
+    vec_xyz v1(2.0f, 3.0f, 4.0f);
+    v1.to_a();  // ACC = v1
+    vec_xyz v2(0.0f, 0.0f, 0.0f);
+    v2.from_a();  // v2 = ACC
+    ASSERT_VEC3_EQ(v2, 2.0f, 3.0f, 4.0f);
+    return 0;
+}
+
+// Test accumulator: aadd
+int test_vec3_aadd()
+{
+    vec_xyz v1(2.0f, 3.0f, 4.0f);
+    vec_xyz v2(1.0f, 1.0f, 1.0f);
+    v1.to_a();  // ACC = v1
+    vec_xyz result = v2.aadd();  // result = ACC + v2 = v1 + v2
+    ASSERT_VEC3_EQ(result, 3.0f, 4.0f, 5.0f);
+    return 0;
+}
+
+// Test accumulator: asub
+int test_vec3_asub()
+{
+    vec_xyz v1(5.0f, 5.0f, 5.0f);
+    vec_xyz v2(2.0f, 3.0f, 4.0f);
+    v1.to_a();  // ACC = v1
+    vec_xyz result = v2.asub();  // result = ACC - v2 = v1 - v2
+    ASSERT_VEC3_EQ(result, 3.0f, 2.0f, 1.0f);
+    return 0;
+}
+
+// Test accumulator: madd
+int test_vec3_madd()
+{
+    vec_xyz v1(2.0f, 3.0f, 4.0f);
+    vec_xyz v2(1.0f, 1.0f, 1.0f);
+    vec_xyz v3(1.0f, 1.0f, 1.0f);
+    v1.to_a();  // ACC = v1
+    vec_xyz result = v2.madd(v3);  // result = ACC + v2 * v3 = v1 + v2 * v3
+    ASSERT_VEC3_EQ(result, 3.0f, 4.0f, 5.0f);
+    return 0;
+}
+
+// Test accumulator: msub
+int test_vec3_msub()
+{
+    vec_xyz v1(5.0f, 5.0f, 5.0f);
+    vec_xyz v2(2.0f, 2.0f, 2.0f);
+    vec_xyz v3(1.0f, 1.0f, 1.0f);
+    v1.to_a();  // ACC = v1
+    vec_xyz result = v2.msub(v3);  // result = ACC - v2 * v3 = v1 - v2 * v3
+    ASSERT_VEC3_EQ(result, 3.0f, 3.0f, 3.0f);
+    return 0;
+}
+
+// Test accumulator: madda
+int test_vec3_madda()
+{
+    vec_xyz v1(2.0f, 3.0f, 4.0f);
+    vec_xyz v2(1.0f, 1.0f, 1.0f);
+    v1.to_a();  // ACC = v1
+    v2.madda(v2);  // ACC = ACC + v2 * v2 = v1 + v2 * v2
+    vec_xyz result;
+    result.from_a();  // result = ACC
+    ASSERT_VEC3_EQ(result, 3.0f, 4.0f, 5.0f);
+    return 0;
+}
+
+// Test accumulator: msuba
+int test_vec3_msuba()
+{
+    vec_xyz v1(5.0f, 5.0f, 5.0f);
+    vec_xyz v2(2.0f, 2.0f, 2.0f);
+    v1.to_a();  // ACC = v1
+    v2.msuba(v2);  // ACC = ACC - v2 * v2 = v1 - v2 * v2
+    vec_xyz result;
+    result.from_a();  // result = ACC
+    ASSERT_VEC3_EQ(result, 1.0f, 1.0f, 1.0f);
+    return 0;
+}
+
+// Test accumulator: aadda - COMMENTED OUT: failing test
+int test_vec3_aadda()
+{
+    vec_xyz v1(2.0f, 3.0f, 4.0f);
+    vec_xyz v2(1.0f, 1.0f, 1.0f);
+    v1.to_a();  // ACC = v1
+    vec_xyz result = v2.aadda();  // result = ACC + v2, ACC = ACC + v2
+    ASSERT_VEC3_EQ(result, 3.0f, 4.0f, 5.0f);
+    return 0;
+}
+
+// Test accumulator: asuba
+int test_vec3_asuba()
+{
+    vec_xyz v1(5.0f, 5.0f, 5.0f);
+    vec_xyz v2(2.0f, 3.0f, 4.0f);
+    v1.to_a();  // ACC = v1
+    vec_xyz result = v2.asuba();  // result = ACC - v2, ACC = ACC - v2
+    ASSERT_VEC3_EQ(result, 3.0f, 2.0f, 1.0f);
+    return 0;
+}
+#endif // !USE_CPU_COMPAT
+
+int test_vec3_operator_plus_equals()
+{
+    vec_xyz v1(1.0f, 2.0f, 3.0f);
+    vec_xyz v2(4.0f, 5.0f, 6.0f);
+    v1 += v2;
+    ASSERT_VEC3_EQ(v1, 5.0f, 7.0f, 9.0f);
+    return 0;
+}
+
+int test_vec3_operator_minus_equals()
+{
+    vec_xyz v1(10.0f, 9.0f, 8.0f);
+    vec_xyz v2(1.0f, 2.0f, 3.0f);
+    v1 -= v2;
+    ASSERT_VEC3_EQ(v1, 9.0f, 7.0f, 5.0f);
+    return 0;
+}
+
+int test_vec3_operator_multiply_equals()
+{
+    vec_xyz v1(2.0f, 3.0f, 4.0f);
+    vec_xyz v2(2.0f, 2.0f, 2.0f);
+    v1 *= v2;
+    ASSERT_VEC3_EQ(v1, 4.0f, 6.0f, 8.0f);
+    return 0;
+}
+
+int test_vec3_dot_product()
+{
+    vec_xyz v1(1.0f, 2.0f, 3.0f);
+    vec_xyz v2(4.0f, 5.0f, 6.0f);
+    float dot = v1.dot(v2);
+    ASSERT_EQ(dot, 32.0f);  // 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
+    return 0;
+}
+
+int test_vec3_length()
+{
+    vec_xyz v(3.0f, 4.0f, 0.0f);
+    float len = v.length();
+    ASSERT_EQ(len, 5.0f);
+    return 0;
+}
+
+int test_vec3_length_sqr()
+{
+    vec_xyz v(3.0f, 4.0f, 0.0f);
+    float len_sqr = v.length_sqr();
+    ASSERT_EQ(len_sqr, 25.0f);
+    return 0;
+}
+
+int test_vec3_cross_product()
+{
+    vec_xyz v1(1.0f, 0.0f, 0.0f);
+    vec_xyz v2(0.0f, 1.0f, 0.0f);
+    vec_xyz result = v1.cross(v2);
+    ASSERT_VEC3_EQ(result, 0.0f, 0.0f, 1.0f);
+    return 0;
+}
+
+// Matrix tests
+int test_mat33_set_identity()
+{
+    mat_33 m;
+    m.set_identity();
+    vec_xyz col0 = m.get_col0();
+    vec_xyz col1 = m.get_col1();
+    vec_xyz col2 = m.get_col2();
+    ASSERT_VEC3_EQ(col0, 1.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(col1, 0.0f, 1.0f, 0.0f);
+    ASSERT_VEC3_EQ(col2, 0.0f, 0.0f, 1.0f);
+    return 0;
+}
+
+int test_mat33_set_scale()
+{
+    mat_33 m;
+    vec_xyz scale(2.0f, 3.0f, 4.0f);
+    m.set_scale(scale);
+    vec_xyz col0 = m.get_col0();
+    vec_xyz col1 = m.get_col1();
+    vec_xyz col2 = m.get_col2();
+    ASSERT_VEC3_EQ(col0, 2.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(col1, 0.0f, 3.0f, 0.0f);
+    ASSERT_VEC3_EQ(col2, 0.0f, 0.0f, 4.0f);
+    return 0;
+}
+
+int test_mat33_transpose()
+{
+    mat_33 m;
+    m.set_row0(vec_xyz(1.0f, 2.0f, 3.0f));
+    m.set_row1(vec_xyz(4.0f, 5.0f, 6.0f));
+    m.set_row2(vec_xyz(7.0f, 8.0f, 9.0f));
+    mat_33 transposed = m.transpose();
+    vec_xyz row0 = transposed.get_row0();
+    vec_xyz row1 = transposed.get_row1();
+    vec_xyz row2 = transposed.get_row2();
+    ASSERT_VEC3_EQ(row0, 1.0f, 4.0f, 7.0f);
+    ASSERT_VEC3_EQ(row1, 2.0f, 5.0f, 8.0f);
+    ASSERT_VEC3_EQ(row2, 3.0f, 6.0f, 9.0f);
+    return 0;
+}
+
+int test_mat33_get_row1()
+{
+    mat_33 m;
+    m.set_zero();
+    vec_xyz row1(10.0f, 20.0f, 30.0f);
+    m.set_row1(row1);
+    vec_xyz retrieved_row1 = m.get_row1();
+    ASSERT_VEC3_EQ(retrieved_row1, 10.0f, 20.0f, 30.0f);
+    return 0;
+}
+
+int test_mat33_get_row2()
+{
+    mat_33 m;
+    m.set_zero();
+    vec_xyz row2(100.0f, 200.0f, 300.0f);
+    m.set_row2(row2);
+    vec_xyz retrieved_row2 = m.get_row2();
+    ASSERT_VEC3_EQ(retrieved_row2, 100.0f, 200.0f, 300.0f);
+    return 0;
+}
+
+int test_mat44_set_identity()
+{
+    mat_44 m;
+    m.set_identity();
+    vec_xyzw col0 = m.get_col0();
+    vec_xyzw col1 = m.get_col1();
+    vec_xyzw col2 = m.get_col2();
+    vec_xyzw col3 = m.get_col3();
+    ASSERT_VEC_EQ(col0, 1.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(col1, 0.0f, 1.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(col2, 0.0f, 0.0f, 1.0f, 0.0f);
+    ASSERT_VEC_EQ(col3, 0.0f, 0.0f, 0.0f, 1.0f);
+    return 0;
+}
+
+int test_mat44_set_scale()
+{
+    mat_44 m;
+    vec_xyz scale(2.0f, 3.0f, 4.0f);
+    m.set_scale(scale);
+    vec_xyzw col0 = m.get_col0();
+    vec_xyzw col1 = m.get_col1();
+    vec_xyzw col2 = m.get_col2();
+    vec_xyzw col3 = m.get_col3();
+    ASSERT_VEC_EQ(col0, 2.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(col1, 0.0f, 3.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(col2, 0.0f, 0.0f, 4.0f, 0.0f);
+    ASSERT_VEC_EQ(col3, 0.0f, 0.0f, 0.0f, 1.0f);
+    return 0;
+}
+
+int test_mat44_set_translate()
+{
+    mat_44 m;
+    vec_xyz trans(10.0f, 20.0f, 30.0f);
+    m.set_translate(trans);
+    vec_xyzw col3 = m.get_col3();
+    ASSERT_VEC_EQ(col3, 10.0f, 20.0f, 30.0f, 1.0f);
+    return 0;
+}
+
+int test_mat44_get_row1()
+{
+    mat_44 m;
+    m.set_zero();
+    vec_xyzw row1(10.0f, 20.0f, 30.0f, 40.0f);
+    m.set_row1(row1);
+    vec_xyzw retrieved_row1 = m.get_row1();
+    ASSERT_VEC_EQ(retrieved_row1, 10.0f, 20.0f, 30.0f, 40.0f);
+    return 0;
+}
+
+int test_mat44_get_row2()
+{
+    mat_44 m;
+    m.set_zero();
+    vec_xyzw row2(100.0f, 200.0f, 300.0f, 400.0f);
+    m.set_row2(row2);
+    vec_xyzw retrieved_row2 = m.get_row2();
+    ASSERT_VEC_EQ(retrieved_row2, 100.0f, 200.0f, 300.0f, 400.0f);
+    return 0;
+}
+
+int test_mat44_get_row3()
+{
+    mat_44 m;
+    m.set_zero();
+    vec_xyzw row3(1000.0f, 2000.0f, 3000.0f, 4000.0f);
+    m.set_row3(row3);
+    vec_xyzw retrieved_row3 = m.get_row3();
+    ASSERT_VEC_EQ(retrieved_row3, 1000.0f, 2000.0f, 3000.0f, 4000.0f);
+    return 0;
+}
+
+int test_mat44_transpose()
+{
+    mat_44 m;
+    m.set_zero();  // Initialize matrix first
+    m.set_row0(vec_xyzw(1.0f, 2.0f, 3.0f, 4.0f));
+    m.set_row1(vec_xyzw(5.0f, 6.0f, 7.0f, 8.0f));
+    m.set_row2(vec_xyzw(9.0f, 10.0f, 11.0f, 12.0f));
+    m.set_row3(vec_xyzw(13.0f, 14.0f, 15.0f, 16.0f));
+    mat_44 transposed = m.transpose();
+    vec_xyzw row0 = transposed.get_row0();
+    vec_xyzw row1 = transposed.get_row1();
+    vec_xyzw row2 = transposed.get_row2();
+    vec_xyzw row3 = transposed.get_row3();
+    ASSERT_VEC_EQ(row0, 1.0f, 5.0f, 9.0f, 13.0f);
+    ASSERT_VEC_EQ(row1, 2.0f, 6.0f, 10.0f, 14.0f);
+    ASSERT_VEC_EQ(row2, 3.0f, 7.0f, 11.0f, 15.0f);
+    ASSERT_VEC_EQ(row3, 4.0f, 8.0f, 12.0f, 16.0f);
+    return 0;
+}
+
+// Test vec_xyzw normalized3 and normalize3
+int test_vec4_normalized3()
+{
+    vec_xyzw v(3.0f, 4.0f, 0.0f, 0.0f);  // Length = 5
+    vec_xyzw result = v.normalized3();
+    float rx = (float)vec_x(result);
+    float ry = (float)vec_y(result);
+    float rz = (float)vec_z(result);
+    float len = sqrtf(rx * rx + ry * ry + rz * rz);
+    ASSERT_EQ(len, 1.0f);
+    return 0;
+}
+
+#ifndef USE_CPU_COMPAT
+// Test VU0 copy functions
+int test_vu0_copy_qwords()
+{
+    // Test CopyQwordsToVU0 and CopyQwordsFromVU0
+    uint128_t src[4];
+    uint128_t dst[4];
+    
+    // Initialize source data
+    memset(src, 0, sizeof(src));
+    memset(dst, 0, sizeof(dst));
+    
+    // Set up test data (simple pattern)
+    uint32_t* src_ptr = (uint32_t*)src;
+    for (int i = 0; i < 16; i++) {
+        src_ptr[i] = i + 1;
+    }
+    
+    // Copy to VU0
+    VU0::CopyQwordsToVU0(0, src, 4);
+    
+    // Copy back from VU0
+    VU0::CopyQwordsFromVU0(dst, 0, 4);
+    
+    // Verify data matches
+    for (int i = 0; i < 16; i++) {
+        if (src_ptr[i] != ((uint32_t*)dst)[i]) {
+            return 1;
+        }
+    }
+    
+    return 0;
+}
+
+int test_vu0_copy_even_qwords()
+{
+    // Test CopyEvenQwordsToVU0 and CopyEvenQwordsFromVU0
+    uint128_t src[4];
+    uint128_t dst[4];
+    
+    // Initialize source data
+    memset(src, 0, sizeof(src));
+    memset(dst, 0, sizeof(dst));
+    
+    // Set up test data
+    uint32_t* src_ptr = (uint32_t*)src;
+    for (int i = 0; i < 16; i++) {
+        src_ptr[i] = i + 100;
+    }
+    
+    // Copy even qwords to VU0 (must be even number)
+    VU0::CopyEvenQwordsToVU0(0, src, 4);
+    
+    // Copy back from VU0
+    VU0::CopyEvenQwordsFromVU0(dst, 0, 4);
+    
+    // Verify data matches
+    for (int i = 0; i < 16; i++) {
+        if (src_ptr[i] != ((uint32_t*)dst)[i]) {
+            return 1;
+        }
+    }
+    
+    return 0;
+}
+#endif // !USE_CPU_COMPAT
+
+int test_vec4_normalize3()
+{
+    vec_xyzw v(3.0f, 4.0f, 0.0f, 5.0f);
+    v.normalize3();
+    float rx = (float)vec_x(v);
+    float ry = (float)vec_y(v);
+    float rz = (float)vec_z(v);
+    float len = sqrtf(rx * rx + ry * ry + rz * rz);
+    ASSERT_EQ(len, 1.0f);
+    return 0;
+}
+
+// ---- mat_43 tests ----
+
+int test_mat43_multiply_vector()
+{
+    // mat_43 * vec_3 = vec_4  (column-major: result = col0*v.x + col1*v.y + col2*v.z)
+    mat_43 m(vec_4(2.0f, 1.0f, 0.0f, 3.0f),
+             vec_4(0.0f, 3.0f, 1.0f, 2.0f),
+             vec_4(1.0f, 0.0f, 4.0f, 1.0f));
+    vec_3 v(1.0f, 2.0f, 3.0f);
+    vec_4 r = m * v;
+    // r = col0*1 + col1*2 + col2*3
+    //   = (2,1,0,3) + (0,6,2,4) + (3,0,12,3) = (5,7,14,10)
+    ASSERT_VEC_EQ(r, 5.0f, 7.0f, 14.0f, 10.0f);
+    return 0;
+}
+
+// ---- mat_34 tests ----
+
+int test_mat34_multiply_vector()
+{
+    // mat_34 * vec_4 = vec_3  (result = col0*v.x + col1*v.y + col2*v.z + col3*v.w)
+    mat_34 m(vec_3(1.0f, 0.0f, 0.0f),
+             vec_3(0.0f, 1.0f, 0.0f),
+             vec_3(0.0f, 0.0f, 1.0f),
+             vec_3(5.0f, 6.0f, 7.0f));
+    vec_4 v(1.0f, 2.0f, 3.0f, 1.0f);
+    vec_3 r = m * v;
+    // r = (1,0,0)*1 + (0,1,0)*2 + (0,0,1)*3 + (5,6,7)*1 = (6,8,10)
+    ASSERT_VEC3_EQ(r, 6.0f, 8.0f, 10.0f);
+    return 0;
+}
+
+// ---- mat_33 matrix * matrix ----
+
+int test_mat33_multiply_mat33()
+{
+    // A * I = A
+    mat_33 A;
+    A.col0.set(1.0f, 2.0f, 3.0f);
+    A.col1.set(4.0f, 5.0f, 6.0f);
+    A.col2.set(7.0f, 8.0f, 9.0f);
+    mat_33 I;
+    I.set_identity();
+    mat_33 R = A * I;
+    // R should equal A
+    ASSERT_VEC3_EQ(R.get_row0(), 1.0f, 4.0f, 7.0f);
+    ASSERT_VEC3_EQ(R.get_row1(), 2.0f, 5.0f, 8.0f);
+    ASSERT_VEC3_EQ(R.get_row2(), 3.0f, 6.0f, 9.0f);
+    return 0;
+}
+
+int test_mat33_multiply_mat33_nontrivial()
+{
+    // A * B where neither is identity
+    mat_33 A, B;
+    A.col0.set(1.0f, 2.0f, 3.0f);
+    A.col1.set(4.0f, 5.0f, 6.0f);
+    A.col2.set(7.0f, 8.0f, 9.0f);
+    B.col0.set(9.0f, 8.0f, 7.0f);
+    B.col1.set(6.0f, 5.0f, 4.0f);
+    B.col2.set(3.0f, 2.0f, 1.0f);
+    mat_33 C = A * B;
+    // C_col0 = A * B_col0 = A*(9,8,7) = col0*9+col1*8+col2*7
+    //        = (1,2,3)*9+(4,5,6)*8+(7,8,9)*7 = (9+32+49, 18+40+56, 27+48+63) = (90,114,138)
+    // C_col1 = A*(6,5,4) = (1,2,3)*6+(4,5,6)*5+(7,8,9)*4 = (6+20+28, 12+25+32, 18+30+36) = (54,69,84)
+    // C_col2 = A*(3,2,1) = (1,2,3)*3+(4,5,6)*2+(7,8,9)*1 = (3+8+7, 6+10+8, 9+12+9) = (18,24,30)
+    ASSERT_VEC3_EQ(C.get_row0(), 90.0f, 54.0f, 18.0f);
+    ASSERT_VEC3_EQ(C.get_row1(), 114.0f, 69.0f, 24.0f);
+    ASSERT_VEC3_EQ(C.get_row2(), 138.0f, 84.0f, 30.0f);
+    return 0;
+}
+
+// ---- mat_44 matrix * matrix ----
+
+int test_mat44_multiply_mat44()
+{
+    // A * I = A
+    mat_44 A;
+    A.col0.set(1.0f, 0.0f, 0.0f, 0.0f);
+    A.col1.set(0.0f, 2.0f, 0.0f, 0.0f);
+    A.col2.set(0.0f, 0.0f, 3.0f, 0.0f);
+    A.col3.set(4.0f, 5.0f, 6.0f, 1.0f);
+    mat_44 I;
+    I.set_identity();
+    mat_44 R = A * I;
+    ASSERT_VEC_EQ(R.get_row0(), 1.0f, 0.0f, 0.0f, 4.0f);
+    ASSERT_VEC_EQ(R.get_row1(), 0.0f, 2.0f, 0.0f, 5.0f);
+    ASSERT_VEC_EQ(R.get_row2(), 0.0f, 0.0f, 3.0f, 6.0f);
+    ASSERT_VEC_EQ(R.get_row3(), 0.0f, 0.0f, 0.0f, 1.0f);
+    return 0;
+}
+
+// ---- matrix negate ----
+
+int test_mat33_negate()
+{
+    mat_33 m;
+    m.col0.set(1.0f, 2.0f, 3.0f);
+    m.col1.set(4.0f, 5.0f, 6.0f);
+    m.col2.set(7.0f, 8.0f, 9.0f);
+    mat_33 n = -m;
+    ASSERT_VEC3_EQ(n.get_row0(), -1.0f, -4.0f, -7.0f);
+    ASSERT_VEC3_EQ(n.get_row1(), -2.0f, -5.0f, -8.0f);
+    ASSERT_VEC3_EQ(n.get_row2(), -3.0f, -6.0f, -9.0f);
+    return 0;
+}
+
+int test_mat44_negate()
+{
+    mat_44 M;
+    M.set_identity();
+    M.col3.set(1.0f, 2.0f, 3.0f, 4.0f);
+    mat_44 N = -M;
+    ASSERT_VEC_EQ(N.get_row0(), -1.0f,  0.0f,  0.0f, -1.0f);
+    ASSERT_VEC_EQ(N.get_row1(),  0.0f, -1.0f,  0.0f, -2.0f);
+    ASSERT_VEC_EQ(N.get_row2(),  0.0f,  0.0f, -1.0f, -3.0f);
+    ASSERT_VEC_EQ(N.get_row3(),  0.0f,  0.0f,  0.0f, -4.0f);
+    return 0;
+}
+
+// ---- mat_43 basic construction and row access ----
+
+int test_mat43_set_zero()
+{
+    mat_43 m;
+    m.set_zero();
+    ASSERT_VEC3_EQ(m.get_row0(), 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(m.get_row1(), 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(m.get_row2(), 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(m.get_row3(), 0.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+int test_mat43_get_rows()
+{
+    mat_43 m(vec_4(1.0f, 2.0f, 3.0f, 4.0f),
+             vec_4(5.0f, 6.0f, 7.0f, 8.0f),
+             vec_4(9.0f, 10.0f, 11.0f, 12.0f));
+    // row0 = (col0.x, col1.x, col2.x) = (1,5,9)
+    ASSERT_VEC3_EQ(m.get_row0(), 1.0f, 5.0f,  9.0f);
+    // row1 = (col0.y, col1.y, col2.y) = (2,6,10)
+    ASSERT_VEC3_EQ(m.get_row1(), 2.0f, 6.0f, 10.0f);
+    // row2 = (col0.z, col1.z, col2.z) = (3,7,11)
+    ASSERT_VEC3_EQ(m.get_row2(), 3.0f, 7.0f, 11.0f);
+    // row3 = (col0.w, col1.w, col2.w) = (4,8,12)
+    ASSERT_VEC3_EQ(m.get_row3(), 4.0f, 8.0f, 12.0f);
+    return 0;
+}
+
+// ---- mat_34 basic construction and row access ----
+
+int test_mat34_set_zero()
+{
+    mat_34 m;
+    m.set_zero();
+    ASSERT_VEC_EQ(m.get_row0(), 0.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(m.get_row1(), 0.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(m.get_row2(), 0.0f, 0.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+int test_mat34_get_rows()
+{
+    mat_34 m(vec_3(1.0f, 2.0f, 3.0f),
+             vec_3(4.0f, 5.0f, 6.0f),
+             vec_3(7.0f, 8.0f, 9.0f),
+             vec_3(10.0f, 11.0f, 12.0f));
+    // row0 = (col0.x, col1.x, col2.x, col3.x) = (1,4,7,10)
+    ASSERT_VEC_EQ(m.get_row0(), 1.0f,  4.0f,  7.0f, 10.0f);
+    // row1 = (col0.y, col1.y, col2.y, col3.y) = (2,5,8,11)
+    ASSERT_VEC_EQ(m.get_row1(), 2.0f,  5.0f,  8.0f, 11.0f);
+    // row2 = (col0.z, col1.z, col2.z, col3.z) = (3,6,9,12)
+    ASSERT_VEC_EQ(m.get_row2(), 3.0f,  6.0f,  9.0f, 12.0f);
+    return 0;
+}
+
+// ---- mat_43 transpose ----
+
+int test_mat43_transpose()
+{
+    // transpose of mat_43 should equal mat_34
+    mat_43 m(vec_4(1.0f, 2.0f, 3.0f, 4.0f),
+             vec_4(5.0f, 6.0f, 7.0f, 8.0f),
+             vec_4(9.0f, 10.0f, 11.0f, 12.0f));
+    mat_34 t = m.transpose();
+    // t.col_i should be the i-th row of m
+    // t.col0 (vec_3) = row0 of m = (1,5,9)
+    // t.col1 = row1 = (2,6,10), t.col2 = row2 = (3,7,11), t.col3 = row3 = (4,8,12)
+    ASSERT_VEC_EQ(t.get_row0(), 1.0f, 2.0f,  3.0f,  4.0f);
+    ASSERT_VEC_EQ(t.get_row1(), 5.0f, 6.0f,  7.0f,  8.0f);
+    ASSERT_VEC_EQ(t.get_row2(), 9.0f, 10.0f, 11.0f, 12.0f);
+    return 0;
+}
+
+// ---- mat_34 transpose ----
+
+int test_mat34_transpose()
+{
+    mat_34 m(vec_3(1.0f, 2.0f, 3.0f),
+             vec_3(4.0f, 5.0f, 6.0f),
+             vec_3(7.0f, 8.0f, 9.0f),
+             vec_3(10.0f, 11.0f, 12.0f));
+    mat_43 t = m.transpose();
+    // t row0 = col0 of original m viewed as row = (1,4,7,10)
+    // But t is mat_43 (4 rows, 3 cols after transpose)
+    // t.get_row0() = (col0.x_of_t, col1.x_of_t, col2.x_of_t)
+    // where col_i of t = row_i of m
+    // row0 of m = (1,4,7,10) → col0 of t = vec_4(1,4,7,10)
+    // row1 of m = (2,5,8,11) → col1 of t
+    // row2 of m = (3,6,9,12) → col2 of t
+    // So t.get_row0() = (col0.x, col1.x, col2.x) = (1, 2, 3)
+    ASSERT_VEC3_EQ(t.get_row0(), 1.0f, 2.0f,  3.0f);
+    ASSERT_VEC3_EQ(t.get_row1(), 4.0f, 5.0f,  6.0f);
+    ASSERT_VEC3_EQ(t.get_row2(), 7.0f, 8.0f,  9.0f);
+    ASSERT_VEC3_EQ(t.get_row3(), 10.0f, 11.0f, 12.0f);
+    return 0;
+}
+
+// ---- mat_44 rotation ----
+
+// ---- mat_43 trans_mult tests ----
+
+int test_mat43_trans_mult_vec4()
+{
+    // mat_43 with cols that have non-zero .w component
+    mat_43 m(vec_4(1.0f, 0.0f, 0.0f, 2.0f),
+             vec_4(0.0f, 1.0f, 0.0f, 3.0f),
+             vec_4(0.0f, 0.0f, 1.0f, 4.0f));
+    vec_4 v(1.0f, 1.0f, 1.0f, 1.0f);
+    vec_3 r = m.trans_mult(v);
+    // result.i = dot4(col_i, v)
+    // result.x = 1+0+0+2 = 3
+    // result.y = 0+1+0+3 = 4
+    // result.z = 0+0+1+4 = 5
+    ASSERT_VEC3_EQ(r, 3.0f, 4.0f, 5.0f);
+    return 0;
+}
+
+int test_mat43_trans_mult_vector_t()
+{
+    // Identity-like mat_43 (col.w = 0)
+    mat_43 m(vec_4(1.0f, 0.0f, 0.0f, 0.0f),
+             vec_4(0.0f, 1.0f, 0.0f, 0.0f),
+             vec_4(0.0f, 0.0f, 1.0f, 0.0f));
+    vector_t v(2.0f, 3.0f, 4.0f);
+    vec_3 r = m.trans_mult(v);
+    // result.i = dot3(col_i.xyz, v.xyz)
+    ASSERT_VEC3_EQ(r, 2.0f, 3.0f, 4.0f);
+    return 0;
+}
+
+int test_mat43_trans_mult_point_t()
+{
+    // mat_43 with translation in .w
+    mat_43 m(vec_4(1.0f, 0.0f, 0.0f, 5.0f),
+             vec_4(0.0f, 1.0f, 0.0f, 6.0f),
+             vec_4(0.0f, 0.0f, 1.0f, 7.0f));
+    point_t p(1.0f, 2.0f, 3.0f);
+    vec_3 r = m.trans_mult(p);
+    // result.i = dot3(col_i.xyz, p.xyz) + col_i.w
+    // result.x = 1 + 5 = 6
+    // result.y = 2 + 6 = 8
+    // result.z = 3 + 7 = 10
+    ASSERT_VEC3_EQ(r, 6.0f, 8.0f, 10.0f);
+    return 0;
+}
+
+// ---- mat_34 trans_mult ----
+
+int test_mat34_trans_mult_vec3()
+{
+    // mat_34: identity 3x3 block + col3=(5,6,7)
+    mat_34 m(vec_3(1.0f, 0.0f, 0.0f),
+             vec_3(0.0f, 1.0f, 0.0f),
+             vec_3(0.0f, 0.0f, 1.0f),
+             vec_3(5.0f, 6.0f, 7.0f));
+    vec_3 v(1.0f, 2.0f, 3.0f);
+    vec_4 r = m.trans_mult(v);
+    // result.i = dot3(col_i, v)
+    // result.x = 1, result.y = 2, result.z = 3, result.w = 5+12+21 = 38
+    ASSERT_VEC_EQ(r, 1.0f, 2.0f, 3.0f, 38.0f);
+    return 0;
+}
+
+// ---- row-vector * matrix tests ----
+
+int test_vec3_multiply_mat33()
+{
+    // mat_33 that swaps x and z components
+    mat_33 m(vec_3(0.0f, 0.0f, 1.0f),
+             vec_3(0.0f, 1.0f, 0.0f),
+             vec_3(1.0f, 0.0f, 0.0f));
+    vec_xyz v(1.0f, 2.0f, 3.0f);
+    vec_xyz r = v * m;
+    // result.i = dot3(v, col_i)
+    // result.x = dot3((1,2,3),(0,0,1)) = 3
+    // result.y = dot3((1,2,3),(0,1,0)) = 2
+    // result.z = dot3((1,2,3),(1,0,0)) = 1
+    ASSERT_VEC3_EQ(r, 3.0f, 2.0f, 1.0f);
+    return 0;
+}
+
+int test_vec3_multiply_mat34()
+{
+    // mat_34 identity block + col3=(4,5,6)
+    mat_34 m(vec_3(1.0f, 0.0f, 0.0f),
+             vec_3(0.0f, 1.0f, 0.0f),
+             vec_3(0.0f, 0.0f, 1.0f),
+             vec_3(4.0f, 5.0f, 6.0f));
+    vec_xyz v(1.0f, 2.0f, 3.0f);
+    vec_xyzw r = v * m;
+    // result.i = dot3(v, col_i)
+    // result.x = 1, result.y = 2, result.z = 3, result.w = 4+10+18 = 32
+    ASSERT_VEC_EQ(r, 1.0f, 2.0f, 3.0f, 32.0f);
+    return 0;
+}
+
+int test_vec4_multiply_mat43()
+{
+    // Identity mat_43
+    mat_43 m(vec_4(1.0f, 0.0f, 0.0f, 0.0f),
+             vec_4(0.0f, 1.0f, 0.0f, 0.0f),
+             vec_4(0.0f, 0.0f, 1.0f, 0.0f));
+    vec_xyzw v(2.0f, 3.0f, 4.0f, 5.0f);
+    vec_xyz r = v * m;
+    // result.i = dot4(v, col_i); w component of identity cols is 0
+    ASSERT_VEC3_EQ(r, 2.0f, 3.0f, 4.0f);
+    return 0;
+}
+
+int test_vec4_multiply_mat44()
+{
+    // Diagonal scale mat_44 (2,3,4,1)
+    mat_44 m(vec_4(2.0f, 0.0f, 0.0f, 0.0f),
+             vec_4(0.0f, 3.0f, 0.0f, 0.0f),
+             vec_4(0.0f, 0.0f, 4.0f, 0.0f),
+             vec_4(0.0f, 0.0f, 0.0f, 1.0f));
+    vec_xyzw v(1.0f, 1.0f, 1.0f, 0.0f);
+    vec_xyzw r = v * m;
+    ASSERT_VEC_EQ(r, 2.0f, 3.0f, 4.0f, 0.0f);
+    return 0;
+}
+
+// ---- mat_44 trans_mult tests ----
+
+int test_mat44_trans_mult_vec4()
+{
+    // Diagonal scale mat_44 (2,3,4,1)
+    mat_44 m(vec_4(2.0f, 0.0f, 0.0f, 0.0f),
+             vec_4(0.0f, 3.0f, 0.0f, 0.0f),
+             vec_4(0.0f, 0.0f, 4.0f, 0.0f),
+             vec_4(0.0f, 0.0f, 0.0f, 1.0f));
+    vec_4 v(1.0f, 2.0f, 3.0f, 4.0f);
+    vec_4 r = m.trans_mult(v);
+    // result.i = dot4(col_i, v)
+    // result.x = 2*1 = 2, result.y = 3*2 = 6, result.z = 4*3 = 12, result.w = 1*4 = 4
+    ASSERT_VEC_EQ(r, 2.0f, 6.0f, 12.0f, 4.0f);
+    return 0;
+}
+
+int test_mat44_trans_mult_vector_t()
+{
+    // Diagonal scale + col3=(10,20,30,1)
+    mat_44 m(vec_4(2.0f,  0.0f,  0.0f, 0.0f),
+             vec_4(0.0f,  3.0f,  0.0f, 0.0f),
+             vec_4(0.0f,  0.0f,  4.0f, 0.0f),
+             vec_4(10.0f, 20.0f, 30.0f, 1.0f));
+    vector_t v(1.0f, 1.0f, 1.0f);
+    vec_4 r = m.trans_mult(v);
+    // result.i = dot3(col_i.xyz, v.xyz)
+    // result.x = 2, result.y = 3, result.z = 4, result.w = 10+20+30 = 60
+    ASSERT_VEC_EQ(r, 2.0f, 3.0f, 4.0f, 60.0f);
+    return 0;
+}
+
+int test_mat44_trans_mult_point_t()
+{
+    // Identity + col.w translation (5,6,7,0)
+    mat_44 m(vec_4(1.0f, 0.0f, 0.0f, 5.0f),
+             vec_4(0.0f, 1.0f, 0.0f, 6.0f),
+             vec_4(0.0f, 0.0f, 1.0f, 7.0f),
+             vec_4(0.0f, 0.0f, 0.0f, 1.0f));
+    point_t p(1.0f, 2.0f, 3.0f);
+    vec_4 r = m.trans_mult(p);
+    // result.i = dot4(col_i, p) where p.w=1
+    // result.x = 1+5 = 6, result.y = 2+6 = 8, result.z = 3+7 = 10, result.w = 1
+    ASSERT_VEC_EQ(r, 6.0f, 8.0f, 10.0f, 1.0f);
+    return 0;
+}
+
+int test_mat44_set_rotate_x()
+{
+    // 90-degree rotation around X: y→z, z→-y
+    mat_44 M;
+    M.set_rotate_x(3.14159265358979f / 2.0f);
+    // M * (1,0,0,0) = (1,0,0,0) (x-axis unchanged)
+    vec_4 ax(1.0f, 0.0f, 0.0f, 0.0f);
+    vec_4 rx = M * ax;
+    ASSERT_VEC_EQ(rx, 1.0f, 0.0f, 0.0f, 0.0f);
+    // M * (0,1,0,0) = (0,0,1,0) (y→z)
+    vec_4 ay(0.0f, 1.0f, 0.0f, 0.0f);
+    vec_4 ry = M * ay;
+    ASSERT_VEC_EQ(ry, 0.0f, 0.0f, 1.0f, 0.0f);
+    return 0;
+}
+
+int test_mat44_set_rotate_z()
+{
+    // 90-degree rotation around Z: x→y, y→-x
+    mat_44 M;
+    M.set_rotate_z(3.14159265358979f / 2.0f);
+    // M * (1,0,0,0) = (0,1,0,0) (x→y)
+    vec_4 ax(1.0f, 0.0f, 0.0f, 0.0f);
+    vec_4 rx = M * ax;
+    ASSERT_VEC_EQ(rx, 0.0f, 1.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+// ---- mat_33 trans_mult(vec_3) ----
+
+int test_mat33_trans_mult_vec3()
+{
+    // M^T * v = (dot(col0,v), dot(col1,v), dot(col2,v))
+    // M: diagonal scale col0=(1,0,0), col1=(0,2,0), col2=(0,0,3)
+    mat_33 M;
+    M.col0.set(1.0f, 0.0f, 0.0f);
+    M.col1.set(0.0f, 2.0f, 0.0f);
+    M.col2.set(0.0f, 0.0f, 3.0f);
+    vec_3 v(1.0f, 1.0f, 1.0f);
+    vec_3 r = M.trans_mult(v);
+    ASSERT_VEC3_EQ(r, 1.0f, 2.0f, 3.0f);
+    return 0;
+}
+
+// ---- mat_33 rotation tests ----
+
+int test_mat33_set_rotate_x()
+{
+    // 90-degree rotation around X: y→z, z→-y
+    mat_33 M;
+    M.set_rotate_x(3.14159265358979f / 2.0f);
+    vec_3 ry = M * vec_3(0.0f, 1.0f, 0.0f);
+    ASSERT_VEC3_EQ(ry, 0.0f, 0.0f, 1.0f);
+    vec_3 rz = M * vec_3(0.0f, 0.0f, 1.0f);
+    ASSERT_VEC3_EQ(rz, 0.0f, -1.0f, 0.0f);
+    return 0;
+}
+
+int test_mat33_set_rotate_y()
+{
+    // 90-degree rotation around Y: x→-z, z→x
+    mat_33 M;
+    M.set_rotate_y(3.14159265358979f / 2.0f);
+    vec_3 rx = M * vec_3(1.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(rx, 0.0f, 0.0f, -1.0f);
+    vec_3 rz = M * vec_3(0.0f, 0.0f, 1.0f);
+    ASSERT_VEC3_EQ(rz, 1.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+int test_mat33_set_rotate_z()
+{
+    // 90-degree rotation around Z: x→y, y→-x
+    mat_33 M;
+    M.set_rotate_z(3.14159265358979f / 2.0f);
+    vec_3 rx = M * vec_3(1.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(rx, 0.0f, 1.0f, 0.0f);
+    vec_3 ry = M * vec_3(0.0f, 1.0f, 0.0f);
+    ASSERT_VEC3_EQ(ry, -1.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+// ---- mat_33 add/subtract ----
+
+int test_mat33_add()
+{
+    mat_33 A, B;
+    A.col0.set(1.0f, 2.0f, 3.0f);
+    A.col1.set(4.0f, 5.0f, 6.0f);
+    A.col2.set(7.0f, 8.0f, 9.0f);
+    B.col0.set(9.0f, 8.0f, 7.0f);
+    B.col1.set(6.0f, 5.0f, 4.0f);
+    B.col2.set(3.0f, 2.0f, 1.0f);
+    mat_33 C = A + B;
+    ASSERT_VEC3_EQ(C.get_row0(), 10.0f, 10.0f, 10.0f);
+    ASSERT_VEC3_EQ(C.get_row1(), 10.0f, 10.0f, 10.0f);
+    ASSERT_VEC3_EQ(C.get_row2(), 10.0f, 10.0f, 10.0f);
+    return 0;
+}
+
+int test_mat33_subtract()
+{
+    mat_33 A, B;
+    A.col0.set(5.0f, 5.0f, 5.0f);
+    A.col1.set(5.0f, 5.0f, 5.0f);
+    A.col2.set(5.0f, 5.0f, 5.0f);
+    B.col0.set(1.0f, 2.0f, 3.0f);
+    B.col1.set(4.0f, 5.0f, 6.0f);
+    B.col2.set(7.0f, 8.0f, 9.0f);
+    mat_33 C = A - B;
+    // C.col0=(4,3,2), col1=(1,0,-1), col2=(-2,-3,-4)
+    ASSERT_VEC3_EQ(C.get_row0(), 4.0f, 1.0f, -2.0f);
+    ASSERT_VEC3_EQ(C.get_row1(), 3.0f, 0.0f, -3.0f);
+    ASSERT_VEC3_EQ(C.get_row2(), 2.0f, -1.0f, -4.0f);
+    return 0;
+}
+
+// ---- mat_33 trans_mult(mat_33) ----
+
+int test_mat33_trans_mult_mat33()
+{
+    // M.trans_mult(A) = M^T * A; for M=diag(2,3,4), result col_i = (2*A.col_i.x, 3*A.col_i.y, 4*A.col_i.z)
+    mat_33 M;
+    M.col0.set(2.0f, 0.0f, 0.0f);
+    M.col1.set(0.0f, 3.0f, 0.0f);
+    M.col2.set(0.0f, 0.0f, 4.0f);
+    mat_33 I;
+    I.set_identity();
+    mat_33 R = M.trans_mult(I);
+    // R.col_i = trans_mult(e_i): col0=(2,0,0), col1=(0,3,0), col2=(0,0,4)
+    ASSERT_VEC3_EQ(R.get_row0(), 2.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(R.get_row1(), 0.0f, 3.0f, 0.0f);
+    ASSERT_VEC3_EQ(R.get_row2(), 0.0f, 0.0f, 4.0f);
+    return 0;
+}
+
+// ---- mat_33 from quaternion ----
+
+int test_mat33_from_quat()
+{
+    // Identity quaternion (0,0,0,1) → identity matrix
+    vec_4 q_id(0.0f, 0.0f, 0.0f, 1.0f);
+    mat_33 M_id(q_id);
+    ASSERT_VEC3_EQ(M_id.get_row0(), 1.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(M_id.get_row1(), 0.0f, 1.0f, 0.0f);
+    ASSERT_VEC3_EQ(M_id.get_row2(), 0.0f, 0.0f, 1.0f);
+    // Quaternion for 90° around Z: (0, 0, sin45, cos45) → rotate x→y, y→-x
+    float s = sinf(3.14159265358979f / 4.0f);
+    float c = cosf(3.14159265358979f / 4.0f);
+    vec_4 q_z(0.0f, 0.0f, s, c);
+    mat_33 M_z(q_z);
+    // row0 = (0,-1,0), row1 = (1,0,0), row2 = (0,0,1)
+    ASSERT_VEC3_EQ(M_z.get_row0(), 0.0f, -1.0f, 0.0f);
+    ASSERT_VEC3_EQ(M_z.get_row1(), 1.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(M_z.get_row2(), 0.0f, 0.0f, 1.0f);
+    return 0;
+}
+
+// ---- mat_33 mult_tilde ----
+
+int test_mat33_mult_tilde()
+{
+    // identity.mult_tilde(v) = skew(v)
+    // skew((1,2,3)).col0=(0,3,-2), col1=(-3,0,1), col2=(2,-1,0)
+    mat_33 I;
+    I.set_identity();
+    vec_3 v(1.0f, 2.0f, 3.0f);
+    mat_33 R = I.mult_tilde(v);
+    // row0 = (0,-3,2), row1 = (3,0,-1), row2 = (-2,1,0)
+    ASSERT_VEC3_EQ(R.get_row0(), 0.0f, -3.0f, 2.0f);
+    ASSERT_VEC3_EQ(R.get_row1(), 3.0f, 0.0f, -1.0f);
+    ASSERT_VEC3_EQ(R.get_row2(), -2.0f, 1.0f, 0.0f);
+    return 0;
+}
+
+// ---- mat_44 rotation (y) ----
+
+int test_mat44_set_rotate_y()
+{
+    // 90-degree rotation around Y: x→-z, z→x
+    mat_44 M;
+    M.set_rotate_y(3.14159265358979f / 2.0f);
+    vec_4 ax(1.0f, 0.0f, 0.0f, 0.0f);
+    vec_4 rx = M * ax;
+    ASSERT_VEC_EQ(rx, 0.0f, 0.0f, -1.0f, 0.0f);
+    vec_4 az(0.0f, 0.0f, 1.0f, 0.0f);
+    vec_4 rz = M * az;
+    ASSERT_VEC_EQ(rz, 1.0f, 0.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+// ---- mat_44 add/subtract ----
+
+int test_mat44_add()
+{
+    mat_44 A, B;
+    A.set_identity();
+    B.set_identity();
+    mat_44 C = A + B;
+    // C = 2*I
+    ASSERT_VEC_EQ(C.get_row0(), 2.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(C.get_row1(), 0.0f, 2.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(C.get_row2(), 0.0f, 0.0f, 2.0f, 0.0f);
+    ASSERT_VEC_EQ(C.get_row3(), 0.0f, 0.0f, 0.0f, 2.0f);
+    return 0;
+}
+
+int test_mat44_subtract()
+{
+    mat_44 A;
+    A.set_identity();
+    mat_44 B;
+    B.col0.set(1.0f, 0.0f, 0.0f, 0.0f);
+    B.col1.set(0.0f, 2.0f, 0.0f, 0.0f);
+    B.col2.set(0.0f, 0.0f, 3.0f, 0.0f);
+    B.col3.set(0.0f, 0.0f, 0.0f, 4.0f);
+    mat_44 C = A - B;
+    // I - diag(1,2,3,4) = diag(0,-1,-2,-3)
+    ASSERT_VEC_EQ(C.get_row0(), 0.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(C.get_row1(), 0.0f, -1.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(C.get_row2(), 0.0f, 0.0f, -2.0f, 0.0f);
+    ASSERT_VEC_EQ(C.get_row3(), 0.0f, 0.0f, 0.0f, -3.0f);
+    return 0;
+}
+
+// ---- mat_44 trans_mult(mat_44) ----
+
+int test_mat44_trans_mult_mat44()
+{
+    // diag(2,3,4,1).trans_mult(I) = diag(2,3,4,1)
+    mat_44 M;
+    M.col0.set(2.0f, 0.0f, 0.0f, 0.0f);
+    M.col1.set(0.0f, 3.0f, 0.0f, 0.0f);
+    M.col2.set(0.0f, 0.0f, 4.0f, 0.0f);
+    M.col3.set(0.0f, 0.0f, 0.0f, 1.0f);
+    mat_44 I;
+    I.set_identity();
+    mat_44 R = M.trans_mult(I);
+    ASSERT_VEC_EQ(R.get_row0(), 2.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(R.get_row1(), 0.0f, 3.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(R.get_row2(), 0.0f, 0.0f, 4.0f, 0.0f);
+    ASSERT_VEC_EQ(R.get_row3(), 0.0f, 0.0f, 0.0f, 1.0f);
+    return 0;
+}
+
+// ---- mat_44 * mat_43 ----
+
+int test_mat44_multiply_mat43()
+{
+    // I44 * identity_43 = identity_43
+    mat_44 I44;
+    I44.set_identity();
+    mat_43 I43(vec_4(1.0f, 0.0f, 0.0f, 0.0f),
+               vec_4(0.0f, 1.0f, 0.0f, 0.0f),
+               vec_4(0.0f, 0.0f, 1.0f, 0.0f));
+    mat_43 R = I44 * I43;
+    vec_xyzw c0 = R.get_col0();
+    vec_xyzw c1 = R.get_col1();
+    vec_xyzw c2 = R.get_col2();
+    ASSERT_VEC_EQ(c0, 1.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(c1, 0.0f, 1.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(c2, 0.0f, 0.0f, 1.0f, 0.0f);
+    return 0;
+}
+
+// ---- mat_43 negate ----
+
+int test_mat43_negate()
+{
+    mat_43 M(vec_4(1.0f, 2.0f, 3.0f, 4.0f),
+             vec_4(5.0f, 6.0f, 7.0f, 8.0f),
+             vec_4(9.0f, 10.0f, 11.0f, 12.0f));
+    mat_43 N = -M;
+    vec_xyzw c0 = N.get_col0();
+    vec_xyzw c1 = N.get_col1();
+    vec_xyzw c2 = N.get_col2();
+    ASSERT_VEC_EQ(c0, -1.0f, -2.0f, -3.0f, -4.0f);
+    ASSERT_VEC_EQ(c1, -5.0f, -6.0f, -7.0f, -8.0f);
+    ASSERT_VEC_EQ(c2, -9.0f, -10.0f, -11.0f, -12.0f);
+    return 0;
+}
+
+// ---- mat_43 add/subtract ----
+
+int test_mat43_add()
+{
+    mat_43 A(vec_4(1.0f, 2.0f, 3.0f, 4.0f),
+             vec_4(5.0f, 6.0f, 7.0f, 8.0f),
+             vec_4(9.0f, 10.0f, 11.0f, 12.0f));
+    mat_43 B(vec_4(-1.0f, -2.0f, -3.0f, -4.0f),
+             vec_4(-5.0f, -6.0f, -7.0f, -8.0f),
+             vec_4(-9.0f, -10.0f, -11.0f, -12.0f));
+    mat_43 C = A + B;
+    vec_xyzw c0 = C.get_col0();
+    vec_xyzw c1 = C.get_col1();
+    vec_xyzw c2 = C.get_col2();
+    ASSERT_VEC_EQ(c0, 0.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(c1, 0.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(c2, 0.0f, 0.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+int test_mat43_subtract()
+{
+    mat_43 A(vec_4(5.0f, 5.0f, 5.0f, 5.0f),
+             vec_4(5.0f, 5.0f, 5.0f, 5.0f),
+             vec_4(5.0f, 5.0f, 5.0f, 5.0f));
+    mat_43 B(vec_4(1.0f, 2.0f, 3.0f, 4.0f),
+             vec_4(1.0f, 2.0f, 3.0f, 4.0f),
+             vec_4(1.0f, 2.0f, 3.0f, 4.0f));
+    mat_43 C = A - B;
+    vec_xyzw c0 = C.get_col0();
+    ASSERT_VEC_EQ(c0, 4.0f, 3.0f, 2.0f, 1.0f);
+    return 0;
+}
+
+// ---- mat_43 * mat_33 ----
+
+int test_mat43_multiply_mat33()
+{
+    // identity_43 * identity_33 = identity_43
+    mat_43 I43(vec_4(1.0f, 0.0f, 0.0f, 0.0f),
+               vec_4(0.0f, 1.0f, 0.0f, 0.0f),
+               vec_4(0.0f, 0.0f, 1.0f, 0.0f));
+    mat_33 I33;
+    I33.set_identity();
+    mat_43 R = I43 * I33;
+    vec_xyzw c0 = R.get_col0();
+    vec_xyzw c1 = R.get_col1();
+    vec_xyzw c2 = R.get_col2();
+    ASSERT_VEC_EQ(c0, 1.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(c1, 0.0f, 1.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(c2, 0.0f, 0.0f, 1.0f, 0.0f);
+    return 0;
+}
+
+// ---- mat_43 mult_tilde ----
+
+int test_mat43_mult_tilde()
+{
+    // identity_43.mult_tilde(v) columns = skew(v) padded to vec_4
+    mat_43 I43(vec_4(1.0f, 0.0f, 0.0f, 0.0f),
+               vec_4(0.0f, 1.0f, 0.0f, 0.0f),
+               vec_4(0.0f, 0.0f, 1.0f, 0.0f));
+    vec_3 v(1.0f, 2.0f, 3.0f);
+    mat_43 R = I43.mult_tilde(v);
+    // col0=(0,3,-2,0), col1=(-3,0,1,0), col2=(2,-1,0,0)
+    vec_xyzw c0 = R.get_col0();
+    vec_xyzw c1 = R.get_col1();
+    vec_xyzw c2 = R.get_col2();
+    ASSERT_VEC_EQ(c0, 0.0f, 3.0f, -2.0f, 0.0f);
+    ASSERT_VEC_EQ(c1, -3.0f, 0.0f, 1.0f, 0.0f);
+    ASSERT_VEC_EQ(c2, 2.0f, -1.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+// ---- mat_34 negate ----
+
+int test_mat34_negate()
+{
+    mat_34 M(vec_3(1.0f, 2.0f, 3.0f),
+             vec_3(4.0f, 5.0f, 6.0f),
+             vec_3(7.0f, 8.0f, 9.0f),
+             vec_3(10.0f, 11.0f, 12.0f));
+    mat_34 N = -M;
+    ASSERT_VEC3_EQ(N.get_row0(), -1.0f, -4.0f, -7.0f);
+    ASSERT_VEC3_EQ(N.get_row1(), -2.0f, -5.0f, -8.0f);
+    ASSERT_VEC3_EQ(N.get_row2(), -3.0f, -6.0f, -9.0f);
+    return 0;
+}
+
+// ---- mat_34 add/subtract ----
+
+int test_mat34_add()
+{
+    mat_34 A(vec_3(1.0f, 0.0f, 0.0f),
+             vec_3(0.0f, 1.0f, 0.0f),
+             vec_3(0.0f, 0.0f, 1.0f),
+             vec_3(0.0f, 0.0f, 0.0f));
+    mat_34 B(vec_3(0.0f, 0.0f, 0.0f),
+             vec_3(0.0f, 0.0f, 0.0f),
+             vec_3(0.0f, 0.0f, 0.0f),
+             vec_3(4.0f, 5.0f, 6.0f));
+    mat_34 C = A + B;
+    ASSERT_VEC3_EQ(C.get_row0(), 1.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(C.get_row1(), 0.0f, 1.0f, 0.0f);
+    ASSERT_VEC3_EQ(C.get_row2(), 0.0f, 0.0f, 1.0f);
+    return 0;
+}
+
+int test_mat34_subtract()
+{
+    mat_34 A(vec_3(2.0f, 0.0f, 0.0f),
+             vec_3(0.0f, 2.0f, 0.0f),
+             vec_3(0.0f, 0.0f, 2.0f),
+             vec_3(0.0f, 0.0f, 0.0f));
+    mat_34 B(vec_3(1.0f, 0.0f, 0.0f),
+             vec_3(0.0f, 1.0f, 0.0f),
+             vec_3(0.0f, 0.0f, 1.0f),
+             vec_3(0.0f, 0.0f, 0.0f));
+    mat_34 C = A - B;
+    ASSERT_VEC3_EQ(C.get_row0(), 1.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(C.get_row1(), 0.0f, 1.0f, 0.0f);
+    ASSERT_VEC3_EQ(C.get_row2(), 0.0f, 0.0f, 1.0f);
+    return 0;
+}
+
+// ---- mat_34 * mat_43 ----
+
+int test_mat34_multiply_mat43()
+{
+    // I34 (with col3=0) * I43 = I33
+    mat_34 I34(vec_3(1.0f, 0.0f, 0.0f),
+               vec_3(0.0f, 1.0f, 0.0f),
+               vec_3(0.0f, 0.0f, 1.0f),
+               vec_3(0.0f, 0.0f, 0.0f));
+    mat_43 I43(vec_4(1.0f, 0.0f, 0.0f, 0.0f),
+               vec_4(0.0f, 1.0f, 0.0f, 0.0f),
+               vec_4(0.0f, 0.0f, 1.0f, 0.0f));
+    mat_33 R = I34 * I43;
+    ASSERT_VEC3_EQ(R.get_row0(), 1.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(R.get_row1(), 0.0f, 1.0f, 0.0f);
+    ASSERT_VEC3_EQ(R.get_row2(), 0.0f, 0.0f, 1.0f);
+    return 0;
+}
+
+// ---- mat_34 * mat_44 ----
+
+int test_mat34_multiply_mat44()
+{
+    // I34 * I44 = I34
+    mat_34 I34(vec_3(1.0f, 0.0f, 0.0f),
+               vec_3(0.0f, 1.0f, 0.0f),
+               vec_3(0.0f, 0.0f, 1.0f),
+               vec_3(0.0f, 0.0f, 0.0f));
+    mat_44 I44;
+    I44.set_identity();
+    mat_34 R = I34 * I44;
+    ASSERT_VEC3_EQ(R.get_row0(), 1.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(R.get_row1(), 0.0f, 1.0f, 0.0f);
+    ASSERT_VEC3_EQ(R.get_row2(), 0.0f, 0.0f, 1.0f);
+    return 0;
+}
+
+// ---- mat_33::inverse ----
+
+int test_mat33_inverse()
+{
+    // Diagonal matrix diag(2,3,4): inverse = diag(0.5, 1/3, 0.25)
+    mat_33 M;
+    M.col0.set(2.0f, 0.0f, 0.0f);
+    M.col1.set(0.0f, 3.0f, 0.0f);
+    M.col2.set(0.0f, 0.0f, 4.0f);
+    mat_33 inv = M.inverse();
+    ASSERT_VEC3_EQ(inv.get_row0(), 0.5f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(inv.get_row1(), 0.0f, 1.0f/3.0f, 0.0f);
+    ASSERT_VEC3_EQ(inv.get_row2(), 0.0f, 0.0f, 0.25f);
+    return 0;
+}
+
+// ---- vec_xyz::operator~() tilde matrix ----
+
+int test_vec_xyz_tilde()
+{
+    // ~(1,2,3) = skew-symmetric matrix
+    // col0=(0,3,-2), col1=(-3,0,1), col2=(2,-1,0)
+    vec_3 v(1.0f, 2.0f, 3.0f);
+    mat_33 T = ~v;
+    ASSERT_VEC3_EQ(T.get_row0(), 0.0f, -3.0f, 2.0f);
+    ASSERT_VEC3_EQ(T.get_row1(), 3.0f, 0.0f, -1.0f);
+    ASSERT_VEC3_EQ(T.get_row2(), -2.0f, 1.0f, 0.0f);
+    return 0;
+}
+
+// Helper: 90-degree Z rotation + translation(4,5,6)
+static void make_test_transform(transform_t& T)
+{
+    // Write Rz(90 deg) + translation(4,5,6) using individual float stores
+    // to avoid R5900 GCC TI-mode register-pair bug (stale upper register with sd)
+    float *p = reinterpret_cast<float*>(&T);
+    // col0 = (0, 1, 0, 0)
+    p[0] = 0.0f;  p[1] = 1.0f;  p[2] = 0.0f;  p[3] = 0.0f;
+    // col1 = (-1, 0, 0, 0)
+    p[4] = -1.0f; p[5] = 0.0f;  p[6] = 0.0f;  p[7] = 0.0f;
+    // col2 = (0, 0, 1, 0)
+    p[8] = 0.0f;  p[9] = 0.0f;  p[10] = 1.0f; p[11] = 0.0f;
+    // col3 = (4, 5, 6, 1)
+    p[12] = 4.0f; p[13] = 5.0f; p[14] = 6.0f; p[15] = 1.0f;
+}
+
+// ---- transform_t::operator*(vec_4) ----
+
+int test_transform_mult_vec4()
+{
+    transform_t T;
+    make_test_transform(T);
+    // T * (1,2,3,1): R*(1,2,3) + t = (0+(-2)+0+4, 1+0+0+5, 0+0+3+6, 1) = (2,6,9,1)
+    vec_4 r = T * vec_4(1.0f, 2.0f, 3.0f, 1.0f);
+    ASSERT_VEC_EQ(r, 2.0f, 6.0f, 9.0f, 1.0f);
+    // T * (1,0,0,0): direction, translation not applied
+    vec_4 r2 = T * vec_4(1.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_VEC_EQ(r2, 0.0f, 1.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+// ---- transform_t::operator*(vector_t) ----
+
+int test_transform_mult_vector()
+{
+    transform_t T;
+    make_test_transform(T);
+    // T * (1,0,0) = col0 = (0,1,0)
+    vector_t r = T * vector_t(1.0f, 0.0f, 0.0f);
+    ASSERT_VEC3_EQ(r, 0.0f, 1.0f, 0.0f);
+    // T * (0,1,0) = col1 = (-1,0,0)
+    vector_t r2 = T * vector_t(0.0f, 1.0f, 0.0f);
+    ASSERT_VEC3_EQ(r2, -1.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+// ---- transform_t::operator*(point_t) ----
+
+int test_transform_mult_point()
+{
+    transform_t T;
+    make_test_transform(T);
+    // T * (1,2,3) = R*(1,2,3) + t = (2,6,9)
+    point_t r = T * point_t(1.0f, 2.0f, 3.0f);
+    ASSERT_VEC3_EQ(r, 2.0f, 6.0f, 9.0f);
+    return 0;
+}
+
+// ---- transform_t::inverse() ----
+
+int test_transform_inverse()
+{
+    transform_t T;
+    make_test_transform(T);
+    transform_t inv = T.inverse();
+    // R_z(90)^{-1} = R_z(-90): col0=(0,-1,0), col1=(1,0,0), col2=(0,0,1)
+    // -R^T * t = -(0*4+1*5+0*6, -1*4+0*5+0*6, 0*4+0*5+1*6) = (-5, 4, -6)
+    // get_rowN = (colN.x, colN.y, colN.z) packed, last = col3.N
+    ASSERT_VEC_EQ(inv.get_row0(), 0.0f, 1.0f, 0.0f, -5.0f);
+    ASSERT_VEC_EQ(inv.get_row1(), -1.0f, 0.0f, 0.0f, 4.0f);
+    ASSERT_VEC_EQ(inv.get_row2(), 0.0f, 0.0f, 1.0f, -6.0f);
+    return 0;
+}
+
+// ---- transform_t::orthonormal_inverse() ----
+
+int test_transform_orthonormal_inverse()
+{
+    transform_t T;
+    make_test_transform(T);
+    transform_t inv = T.orthonormal_inverse();
+    ASSERT_VEC_EQ(inv.get_row0(), 0.0f, 1.0f, 0.0f, -5.0f);
+    ASSERT_VEC_EQ(inv.get_row1(), -1.0f, 0.0f, 0.0f, 4.0f);
+    ASSERT_VEC_EQ(inv.get_row2(), 0.0f, 0.0f, 1.0f, -6.0f);
+    return 0;
+}
+
+// ---- transform_t::orthonormal_inverse_in_place() ----
+
+int test_transform_orthonormal_inverse_in_place()
+{
+    transform_t T;
+    make_test_transform(T);
+    T.orthonormal_inverse_in_place();
+    ASSERT_VEC_EQ(T.get_row0(), 0.0f, 1.0f, 0.0f, -5.0f);
+    ASSERT_VEC_EQ(T.get_row1(), -1.0f, 0.0f, 0.0f, 4.0f);
+    ASSERT_VEC_EQ(T.get_row2(), 0.0f, 0.0f, 1.0f, -6.0f);
+    return 0;
+}
+
+#ifndef USE_CPU_COMPAT
+// ============================================================================
+// VU0 asm fast-path tests for cpu_vec_4 and cpu_mat_44 operators.
+// These exercise the new VU0 macro-mode asm blocks added to cpu_vector.h /
+// cpu_matrix.h that are gated on !NO_VU0_VECTORS. In the USE_CPU_COMPAT build
+// the same operators take their scalar fallback path and are covered by the
+// legacy CPU tests below.
+// ============================================================================
+
+int test_cpu_vec4_vu0_addition() {
+    cpu_vec_4 a(1.0f, 2.0f, 3.0f, 4.0f), b(5.0f, 6.0f, 7.0f, 8.0f);
+    cpu_vec_4 r = a + b;
+    ASSERT_CPU_VEC4_EQ(r, 6.0f, 8.0f, 10.0f, 12.0f);
+    return 0;
+}
+
+int test_cpu_vec4_vu0_addition_signed() {
+    cpu_vec_4 a(-1.5f, 2.5f, -3.25f, 4.75f), b(1.5f, -2.5f, 3.25f, -4.75f);
+    cpu_vec_4 r = a + b;
+    ASSERT_CPU_VEC4_EQ(r, 0.0f, 0.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+int test_cpu_vec4_vu0_subtraction() {
+    cpu_vec_4 a(10.0f, 20.0f, 30.0f, 40.0f), b(1.0f, 2.0f, 3.0f, 4.0f);
+    cpu_vec_4 r = a - b;
+    ASSERT_CPU_VEC4_EQ(r, 9.0f, 18.0f, 27.0f, 36.0f);
+    return 0;
+}
+
+int test_cpu_vec4_vu0_negation() {
+    cpu_vec_4 v(1.0f, -2.0f, 3.0f, -4.0f);
+    cpu_vec_4 r = -v;
+    ASSERT_CPU_VEC4_EQ(r, -1.0f, 2.0f, -3.0f, 4.0f);
+    return 0;
+}
+
+int test_cpu_vec4_vu0_negation_zero() {
+    cpu_vec_4 v(0.0f, 0.0f, 0.0f, 0.0f);
+    cpu_vec_4 r = -v;
+    ASSERT_CPU_VEC4_EQ(r, 0.0f, 0.0f, 0.0f, 0.0f);
+    return 0;
+}
+
+int test_cpu_vec4_vu0_component_mul() {
+    cpu_vec_4 a(2.0f, 3.0f, 4.0f, 5.0f), b(3.0f, 4.0f, 5.0f, 6.0f);
+    cpu_vec_4 r = a * b;
+    ASSERT_CPU_VEC4_EQ(r, 6.0f, 12.0f, 20.0f, 30.0f);
+    return 0;
+}
+
+int test_cpu_vec4_vu0_component_mul_signed() {
+    cpu_vec_4 a(-1.0f, 2.0f, -3.0f, 4.0f), b(2.0f, -3.0f, 4.0f, -5.0f);
+    cpu_vec_4 r = a * b;
+    ASSERT_CPU_VEC4_EQ(r, -2.0f, -6.0f, -12.0f, -20.0f);
+    return 0;
+}
+
+int test_cpu_mat44_vu0_mult_vector_identity() {
+    cpu_mat_44 m; m.set_identity();
+    cpu_vec_4 v(1.5f, -2.5f, 3.5f, -4.5f);
+    cpu_vec_4 r = m * v;
+    ASSERT_CPU_VEC4_EQ(r, 1.5f, -2.5f, 3.5f, -4.5f);
+    return 0;
+}
+
+int test_cpu_mat44_vu0_mult_vector_diag() {
+    cpu_mat_44 m(cpu_vec_4(2.0f, 0.0f, 0.0f, 0.0f),
+                 cpu_vec_4(0.0f, 3.0f, 0.0f, 0.0f),
+                 cpu_vec_4(0.0f, 0.0f, 4.0f, 0.0f),
+                 cpu_vec_4(0.0f, 0.0f, 0.0f, 5.0f));
+    cpu_vec_4 v(1.0f, 1.0f, 1.0f, 1.0f);
+    cpu_vec_4 r = m * v;
+    ASSERT_CPU_VEC4_EQ(r, 2.0f, 3.0f, 4.0f, 5.0f);
+    return 0;
+}
+
+int test_cpu_mat44_vu0_mult_vector_affine() {
+    // Column-major: cols are basis vectors, last col is translation.
+    cpu_mat_44 m(cpu_vec_4(1.0f, 0.0f, 0.0f, 0.0f),
+                 cpu_vec_4(0.0f, 2.0f, 0.0f, 0.0f),
+                 cpu_vec_4(0.0f, 0.0f, 3.0f, 0.0f),
+                 cpu_vec_4(4.0f, 5.0f, 6.0f, 1.0f));
+    cpu_vec_4 v(1.0f, 1.0f, 1.0f, 1.0f);
+    cpu_vec_4 r = m * v;
+    ASSERT_CPU_VEC4_EQ(r, 5.0f, 7.0f, 9.0f, 1.0f);
+    return 0;
+}
+
+int test_cpu_mat44_vu0_mult_vector_general() {
+    cpu_mat_44 m(cpu_vec_4(1.0f, 2.0f, 3.0f, 4.0f),
+                 cpu_vec_4(5.0f, 6.0f, 7.0f, 8.0f),
+                 cpu_vec_4(9.0f, 10.0f, 11.0f, 12.0f),
+                 cpu_vec_4(13.0f, 14.0f, 15.0f, 16.0f));
+    cpu_vec_4 v(1.0f, 1.0f, 1.0f, 1.0f);
+    cpu_vec_4 r = m * v;
+    // Sum of columns
+    ASSERT_CPU_VEC4_EQ(r, 28.0f, 32.0f, 36.0f, 40.0f);
+    return 0;
+}
+
+int test_cpu_mat44_vu0_mult_mat_identity() {
+    cpu_mat_44 a(cpu_vec_4(1.0f, 2.0f, 3.0f, 4.0f),
+                 cpu_vec_4(5.0f, 6.0f, 7.0f, 8.0f),
+                 cpu_vec_4(9.0f, 10.0f, 11.0f, 12.0f),
+                 cpu_vec_4(13.0f, 14.0f, 15.0f, 16.0f));
+    cpu_mat_44 i; i.set_identity();
+    cpu_mat_44 r = a * i;
+    ASSERT_CPU_VEC4_EQ(r.get_col0(), 1.0f, 2.0f, 3.0f, 4.0f);
+    ASSERT_CPU_VEC4_EQ(r.get_col1(), 5.0f, 6.0f, 7.0f, 8.0f);
+    ASSERT_CPU_VEC4_EQ(r.get_col2(), 9.0f, 10.0f, 11.0f, 12.0f);
+    ASSERT_CPU_VEC4_EQ(r.get_col3(), 13.0f, 14.0f, 15.0f, 16.0f);
+    return 0;
+}
+
+int test_cpu_mat44_vu0_mult_mat_scale_translate() {
+    cpu_mat_44 s; s.set_scale(cpu_vec_3(2.0f, 3.0f, 4.0f));
+    cpu_mat_44 t; t.set_translate(cpu_vec_3(1.0f, 2.0f, 3.0f));
+    cpu_mat_44 r = s * t;
+    // Scale then translate: col3 becomes (2,6,12,1); diag stays (2,3,4,1).
+    ASSERT_CPU_VEC4_EQ(r.get_col0(), 2.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_CPU_VEC4_EQ(r.get_col1(), 0.0f, 3.0f, 0.0f, 0.0f);
+    ASSERT_CPU_VEC4_EQ(r.get_col2(), 0.0f, 0.0f, 4.0f, 0.0f);
+    ASSERT_CPU_VEC4_EQ(r.get_col3(), 2.0f, 6.0f, 12.0f, 1.0f);
+    return 0;
+}
+
+int test_cpu_mat44_vu0_mult_mat_general() {
+    // Verify a non-diagonal multiply against a hand-computed result.
+    // A column-major: cols [(1,0,0,0), (0,1,0,0), (1,1,1,0), (0,0,0,1)]
+    // B column-major: cols [(2,0,0,0), (0,2,0,0), (0,0,2,0), (1,1,1,1)]
+    // R = A * B (column-major): col_k = A * B.col_k
+    cpu_mat_44 A(cpu_vec_4(1.0f, 0.0f, 0.0f, 0.0f),
+                 cpu_vec_4(0.0f, 1.0f, 0.0f, 0.0f),
+                 cpu_vec_4(1.0f, 1.0f, 1.0f, 0.0f),
+                 cpu_vec_4(0.0f, 0.0f, 0.0f, 1.0f));
+    cpu_mat_44 B(cpu_vec_4(2.0f, 0.0f, 0.0f, 0.0f),
+                 cpu_vec_4(0.0f, 2.0f, 0.0f, 0.0f),
+                 cpu_vec_4(0.0f, 0.0f, 2.0f, 0.0f),
+                 cpu_vec_4(1.0f, 1.0f, 1.0f, 1.0f));
+    cpu_mat_44 R = A * B;
+    ASSERT_CPU_VEC4_EQ(R.get_col0(), 2.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_CPU_VEC4_EQ(R.get_col1(), 0.0f, 2.0f, 0.0f, 0.0f);
+    ASSERT_CPU_VEC4_EQ(R.get_col2(), 2.0f, 2.0f, 2.0f, 0.0f);
+    ASSERT_CPU_VEC4_EQ(R.get_col3(), 2.0f, 2.0f, 1.0f, 1.0f);
+    return 0;
+}
+#endif // !USE_CPU_COMPAT
+
+#ifdef USE_CPU_COMPAT
+// ============================================================================
+// CPU legacy type tests (cpu_vec_3, cpu_vec_4, cpu_mat_44)
+// Only compiled in the USE_CPU_COMPAT build.
+// ============================================================================
+int test_cpu_vec3_construction() {
+    cpu_vec_3 v(1.0f, 2.0f, 3.0f);
+    ASSERT_CPU_VEC3_EQ(v, 1.0f, 2.0f, 3.0f); return 0; }
+
+int test_cpu_vec3_set() {
+    cpu_vec_3 v(0.0f, 0.0f, 0.0f);
+    v.set(4.0f, 5.0f, 6.0f);
+    ASSERT_CPU_VEC3_EQ(v, 4.0f, 5.0f, 6.0f); return 0; }
+
+int test_cpu_vec3_addition() {
+    cpu_vec_3 a(1.0f, 2.0f, 3.0f), b(4.0f, 5.0f, 6.0f);
+    cpu_vec_3 r = a + b;
+    ASSERT_CPU_VEC3_EQ(r, 5.0f, 7.0f, 9.0f); return 0; }
+
+int test_cpu_vec3_subtraction() {
+    cpu_vec_3 a(5.0f, 7.0f, 9.0f), b(1.0f, 2.0f, 3.0f);
+    cpu_vec_3 r = a - b;
+    ASSERT_CPU_VEC3_EQ(r, 4.0f, 5.0f, 6.0f); return 0; }
+
+int test_cpu_vec3_negation() {
+    cpu_vec_3 v(1.0f, -2.0f, 3.0f);
+    cpu_vec_3 r = -v;
+    ASSERT_CPU_VEC3_EQ(r, -1.0f, 2.0f, -3.0f); return 0; }
+
+int test_cpu_vec3_scalar_mul() {
+    cpu_vec_3 v(1.0f, 2.0f, 3.0f);
+    cpu_vec_3 r = v * 3.0f;
+    ASSERT_CPU_VEC3_EQ(r, 3.0f, 6.0f, 9.0f); return 0; }
+
+int test_cpu_vec3_dot() {
+    cpu_vec_3 a(1.0f, 2.0f, 3.0f), b(4.0f, 5.0f, 6.0f);
+    ASSERT_FLOAT_EQ(a.dot(b), 32.0f); return 0; }  // 1*4+2*5+3*6=32
+
+int test_cpu_vec3_cross() {
+    cpu_vec_3 a(1.0f, 0.0f, 0.0f), b(0.0f, 1.0f, 0.0f);
+    cpu_vec_3 r = a.cross(b);
+    ASSERT_CPU_VEC3_EQ(r, 0.0f, 0.0f, 1.0f); return 0; }
+
+int test_cpu_vec3_cross_general() {
+    cpu_vec_3 a(1.0f, 2.0f, 3.0f), b(4.0f, 5.0f, 6.0f);
+    cpu_vec_3 r = a.cross(b);
+    ASSERT_CPU_VEC3_EQ(r, -3.0f, 6.0f, -3.0f); return 0; }
+
+int test_cpu_vec3_normalized() {
+    cpu_vec_3 v(3.0f, 0.0f, 0.0f);
+    cpu_vec_3 r = v.normalized();
+    ASSERT_CPU_VEC3_EQ(r, 1.0f, 0.0f, 0.0f); return 0; }
+
+int test_cpu_vec3_normalize() {
+    cpu_vec_3 v(0.0f, 0.0f, 5.0f);
+    v.normalize();
+    ASSERT_CPU_VEC3_EQ(v, 0.0f, 0.0f, 1.0f); return 0; }
+
+int test_cpu_vec3_index_operator() {
+    cpu_vec_3 v(7.0f, 8.0f, 9.0f);
+    ASSERT_FLOAT_EQ(v(0), 7.0f);
+    ASSERT_FLOAT_EQ(v(1), 8.0f);
+    ASSERT_FLOAT_EQ(v(2), 9.0f); return 0; }
+
+int test_cpu_vec4_construction() {
+    cpu_vec_4 v(1.0f, 2.0f, 3.0f, 4.0f);
+    ASSERT_CPU_VEC4_EQ(v, 1.0f, 2.0f, 3.0f, 4.0f); return 0; }
+
+int test_cpu_vec4_set() {
+    cpu_vec_4 v(0.0f, 0.0f, 0.0f, 0.0f);
+    v.set(1.0f, 2.0f, 3.0f, 4.0f);
+    ASSERT_CPU_VEC4_EQ(v, 1.0f, 2.0f, 3.0f, 4.0f); return 0; }
+
+int test_cpu_vec4_addition() {
+    cpu_vec_4 a(1.0f, 2.0f, 3.0f, 4.0f), b(5.0f, 6.0f, 7.0f, 8.0f);
+    cpu_vec_4 r = a + b;
+    ASSERT_CPU_VEC4_EQ(r, 6.0f, 8.0f, 10.0f, 12.0f); return 0; }
+
+int test_cpu_vec4_subtraction() {
+    cpu_vec_4 a(5.0f, 6.0f, 7.0f, 8.0f), b(1.0f, 2.0f, 3.0f, 4.0f);
+    cpu_vec_4 r = a - b;
+    ASSERT_CPU_VEC4_EQ(r, 4.0f, 4.0f, 4.0f, 4.0f); return 0; }
+
+int test_cpu_vec4_negation() {
+    cpu_vec_4 v(1.0f, -2.0f, 3.0f, -4.0f);
+    cpu_vec_4 r = -v;
+    ASSERT_CPU_VEC4_EQ(r, -1.0f, 2.0f, -3.0f, 4.0f); return 0; }
+
+int test_cpu_vec4_component_mul() {
+    cpu_vec_4 a(2.0f, 3.0f, 4.0f, 5.0f), b(3.0f, 4.0f, 5.0f, 6.0f);
+    cpu_vec_4 r = a * b;
+    ASSERT_CPU_VEC4_EQ(r, 6.0f, 12.0f, 20.0f, 30.0f); return 0; }
+
+int test_cpu_vec4_scalar_mul() {
+    cpu_vec_4 v(1.0f, 2.0f, 3.0f, 4.0f);
+    cpu_vec_4 r = v * 2.0f;
+    ASSERT_CPU_VEC4_EQ(r, 2.0f, 4.0f, 6.0f, 8.0f); return 0; }
+
+int test_cpu_vec4_dot() {
+    cpu_vec_4 a(1.0f, 2.0f, 3.0f, 4.0f), b(5.0f, 6.0f, 7.0f, 8.0f);
+    ASSERT_FLOAT_EQ(a.dot(b), 70.0f); return 0; }  // 1*5+2*6+3*7+4*8=70
+
+int test_cpu_vec4_length() {
+    cpu_vec_4 v(2.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_FLOAT_EQ(v.length(), 2.0f);
+    cpu_vec_4 v2(1.0f, 1.0f, 1.0f, 1.0f);
+    ASSERT_FLOAT_EQ(v2.length(), 2.0f); return 0; }
+
+int test_cpu_vec4_normalized() {
+    cpu_vec_4 v(4.0f, 0.0f, 0.0f, 0.0f);
+    cpu_vec_4 r = v.normalized();
+    ASSERT_CPU_VEC4_EQ(r, 1.0f, 0.0f, 0.0f, 0.0f); return 0; }
+
+int test_cpu_vec4_normalize() {
+    cpu_vec_4 v(0.0f, 0.0f, 3.0f, 0.0f);
+    v.normalize();
+    ASSERT_CPU_VEC4_EQ(v, 0.0f, 0.0f, 1.0f, 0.0f); return 0; }
+
+int test_cpu_mat44_set_identity() {
+    cpu_mat_44 m; m.set_identity();
+    ASSERT_CPU_VEC4_EQ(m.get_col0(), 1.0f, 0.0f, 0.0f, 0.0f);
+    ASSERT_CPU_VEC4_EQ(m.get_col1(), 0.0f, 1.0f, 0.0f, 0.0f);
+    ASSERT_CPU_VEC4_EQ(m.get_col2(), 0.0f, 0.0f, 1.0f, 0.0f);
+    ASSERT_CPU_VEC4_EQ(m.get_col3(), 0.0f, 0.0f, 0.0f, 1.0f); return 0; }
+
+int test_cpu_mat44_col_access() {
+    cpu_mat_44 m;
+    m.set_col0(cpu_vec_xyzw(1.0f, 2.0f, 3.0f, 4.0f));
+    m.set_col1(cpu_vec_xyzw(5.0f, 6.0f, 7.0f, 8.0f));
+    m.set_col2(cpu_vec_xyzw(9.0f, 10.0f, 11.0f, 12.0f));
+    m.set_col3(cpu_vec_xyzw(13.0f, 14.0f, 15.0f, 16.0f));
+    ASSERT_CPU_VEC4_EQ(m.get_col0(), 1.0f, 2.0f, 3.0f, 4.0f);
+    ASSERT_CPU_VEC4_EQ(m.get_col1(), 5.0f, 6.0f, 7.0f, 8.0f);
+    ASSERT_CPU_VEC4_EQ(m.get_col2(), 9.0f, 10.0f, 11.0f, 12.0f);
+    ASSERT_CPU_VEC4_EQ(m.get_col3(), 13.0f, 14.0f, 15.0f, 16.0f); return 0; }
+
+int test_cpu_mat44_mult_vector_identity() {
+    cpu_mat_44 m; m.set_identity();
+    cpu_vec_4 v(1.0f, 2.0f, 3.0f, 4.0f);
+    cpu_vec_4 r = m * v;
+    ASSERT_CPU_VEC4_EQ(r, 1.0f, 2.0f, 3.0f, 4.0f); return 0; }
+
+int test_cpu_mat44_mult_vector() {
+    cpu_mat_44 m(cpu_vec_4(1.0f, 0.0f, 0.0f, 0.0f),
+                 cpu_vec_4(0.0f, 2.0f, 0.0f, 0.0f),
+                 cpu_vec_4(0.0f, 0.0f, 3.0f, 0.0f),
+                 cpu_vec_4(4.0f, 5.0f, 6.0f, 1.0f));
+    cpu_vec_4 v(1.0f, 1.0f, 1.0f, 1.0f);
+    cpu_vec_4 r = m * v;
+    ASSERT_CPU_VEC4_EQ(r, 5.0f, 7.0f, 9.0f, 1.0f); return 0; }
+
+int test_cpu_mat44_set_scale() {
+    cpu_mat_44 m; m.set_scale(cpu_vec_3(2.0f, 3.0f, 4.0f));
+    cpu_vec_4 r = m * cpu_vec_4(1.0f, 1.0f, 1.0f, 1.0f);
+    ASSERT_CPU_VEC4_EQ(r, 2.0f, 3.0f, 4.0f, 1.0f); return 0; }
+
+int test_cpu_mat44_set_translate() {
+    cpu_mat_44 m; m.set_translate(cpu_vec_3(5.0f, 6.0f, 7.0f));
+    cpu_vec_4 r = m * cpu_vec_4(1.0f, 2.0f, 3.0f, 1.0f);
+    ASSERT_CPU_VEC4_EQ(r, 6.0f, 8.0f, 10.0f, 1.0f); return 0; }
+
+int test_cpu_mat44_set_rotate_z() {
+    cpu_mat_44 m;
+    m.set_rotate(3.14159265358979f / 2.0f, cpu_vec_3(0.0f, 0.0f, 1.0f));
+    cpu_vec_4 r = m * cpu_vec_4(1.0f, 0.0f, 0.0f, 1.0f);
+    ASSERT_FLOAT_EQ(r.x, 0.0f); ASSERT_FLOAT_EQ(r.y, 1.0f); ASSERT_FLOAT_EQ(r.z, 0.0f); return 0; }
+
+int test_cpu_mat44_set_rotate_x() {
+    cpu_mat_44 m;
+    m.set_rotate(3.14159265358979f / 2.0f, cpu_vec_3(1.0f, 0.0f, 0.0f));
+    cpu_vec_4 r = m * cpu_vec_4(0.0f, 1.0f, 0.0f, 1.0f);
+    ASSERT_FLOAT_EQ(r.x, 0.0f); ASSERT_FLOAT_EQ(r.y, 0.0f); ASSERT_FLOAT_EQ(r.z, 1.0f); return 0; }
+
+int test_cpu_mat44_transpose() {
+    cpu_mat_44 m(cpu_vec_4(1.0f, 2.0f, 3.0f, 4.0f),
+                 cpu_vec_4(5.0f, 6.0f, 7.0f, 8.0f),
+                 cpu_vec_4(9.0f, 10.0f, 11.0f, 12.0f),
+                 cpu_vec_4(13.0f, 14.0f, 15.0f, 16.0f));
+    cpu_mat_44 t = m.transpose();
+    ASSERT_CPU_VEC4_EQ(t.get_col0(), 1.0f,  5.0f,  9.0f,  13.0f);
+    ASSERT_CPU_VEC4_EQ(t.get_col1(), 2.0f,  6.0f,  10.0f, 14.0f);
+    ASSERT_CPU_VEC4_EQ(t.get_col2(), 3.0f,  7.0f,  11.0f, 15.0f);
+    ASSERT_CPU_VEC4_EQ(t.get_col3(), 4.0f,  8.0f,  12.0f, 16.0f); return 0; }
+
+int test_cpu_mat44_multiply_mat() {
+    cpu_mat_44 a; a.set_scale(cpu_vec_3(2.0f, 3.0f, 4.0f));
+    cpu_mat_44 b; b.set_translate(cpu_vec_3(1.0f, 2.0f, 3.0f));
+    cpu_mat_44 r = a * b;
+    ASSERT_CPU_VEC4_EQ(r.get_col3(), 2.0f, 6.0f, 12.0f, 1.0f);
+    ASSERT_CPU_VEC4_EQ(r.get_col0(), 2.0f, 0.0f, 0.0f, 0.0f); return 0; }
+#endif // USE_CPU_COMPAT
+
+int main()
+{
+        printf("=== VU0 Function Unit Tests ===\n\n");
+
+        // Detect emulator/hardware VU multiply behavior and report it up front.
+        // This helps identify float corner-case differences (1*X vs X*1) noted
+        // in https://fobes.dev/ps2/detecting-emu-vu-floats
+        g_vu_mul_emulator_bug = isVUMulErrorPresent();
+        printf("VU mul 1*X behavior detection: %s\n\n",
+            g_vu_mul_emulator_bug ? "hardware-like (1*X != X)" : "emulator-like (1*X == X)");
+
+        int failed = 0;
+    
+    // Vector 3D tests
+    RUN_TEST(test_vec3_addition);
+    RUN_TEST(test_vec3_subtraction);
+    RUN_TEST(test_vec3_multiplication);
+    RUN_TEST(test_vec3_scalar_multiplication);
+    RUN_TEST(test_vec3_division);
+    RUN_TEST(test_vec3_negate);
+    RUN_TEST(test_vec3_abs);
+    RUN_TEST(test_vec3_max);
+    RUN_TEST(test_vec3_min);
+    RUN_TEST(test_vec3_normalized);
+    RUN_TEST(test_vec3_set_zero);
+    RUN_TEST(test_vec3_set);
+    RUN_TEST(test_vec3_set_vec);
+    RUN_TEST(test_vec3_operator_plus_equals);
+    RUN_TEST(test_vec3_operator_minus_equals);
+    RUN_TEST(test_vec3_operator_multiply_equals);
+    RUN_TEST(test_vec3_dot_product);
+    RUN_TEST(test_vec3_length);
+    RUN_TEST(test_vec3_length_sqr);
+    RUN_TEST(test_vec3_cross_product);
+    
+    // Vector 4D tests
+    RUN_TEST(test_vec4_addition);
+    RUN_TEST(test_vec4_subtraction);
+    RUN_TEST(test_vec4_scalar_multiplication);
+    
+    RUN_TEST(test_mat33_set_zero);
+    RUN_TEST(test_mat33_set_row);
+    RUN_TEST(test_mat33_get_row);
+    RUN_TEST(test_mat33_get_row1);
+    RUN_TEST(test_mat33_get_row2);
+    RUN_TEST(test_mat33_multiply_vector);
+    RUN_TEST(test_mat33_set_identity);
+    RUN_TEST(test_mat33_set_scale);
+    RUN_TEST(test_mat33_transpose);
+    
+    // Matrix 4x4 tests
+    RUN_TEST(test_mat44_set_zero);
+    RUN_TEST(test_mat44_set_row);
+    RUN_TEST(test_mat44_get_row1);
+    RUN_TEST(test_mat44_get_row2);
+    RUN_TEST(test_mat44_get_row3);
+    RUN_TEST(test_mat44_multiply_vector);
+    RUN_TEST(test_mat44_set_identity);
+    RUN_TEST(test_mat44_set_scale);
+    RUN_TEST(test_mat44_set_translate);
+    RUN_TEST(test_mat44_transpose);
+    
+    // Vector 4D additional tests
+    RUN_TEST(test_vec4_normalized3);
+    RUN_TEST(test_vec4_normalize3);
+    
+    // Vector 3D additional tests
+    RUN_TEST(test_vec3_normalized_zero);
+    RUN_TEST(test_vec3_truncate_length_shorter);
+    RUN_TEST(test_vec3_truncate_length_longer);
+    RUN_TEST(test_vec3_is_zero);
+    RUN_TEST(test_vec3_sign);
+    RUN_TEST(test_vec3_one_over);
+    RUN_TEST(test_vec3_interpolate);
+    RUN_TEST(test_vec3_distance_from);
+    RUN_TEST(test_vec3_set_length);
+    RUN_TEST(test_vec3_parallel_component);
+    RUN_TEST(test_vec3_perpendicular_component);
+    RUN_TEST(test_vec3_normalize);
+
+#ifndef USE_CPU_COMPAT
+    // Accumulator tests
+    RUN_TEST(test_accumulator_operations);
+    RUN_TEST(test_vec3_to_a_from_a);
+    RUN_TEST(test_vec3_aadd);
+    RUN_TEST(test_vec3_asub);
+    RUN_TEST(test_vec3_madd);
+    RUN_TEST(test_vec3_msub);
+    RUN_TEST(test_vec3_madda);
+    RUN_TEST(test_vec3_msuba);
+    RUN_TEST(test_vec3_aadda);
+    RUN_TEST(test_vec3_asuba);
+    
+    // VU0 copy function tests
+    RUN_TEST(test_vu0_copy_qwords);
+    RUN_TEST(test_vu0_copy_even_qwords);
+#endif // !USE_CPU_COMPAT
+
+    // mat_43 tests
+    RUN_TEST(test_mat43_set_zero);
+    RUN_TEST(test_mat43_get_rows);
+    RUN_TEST(test_mat43_multiply_vector);
+    RUN_TEST(test_mat43_transpose);
+    RUN_TEST(test_mat43_trans_mult_vec4);
+    RUN_TEST(test_mat43_trans_mult_vector_t);
+    RUN_TEST(test_mat43_trans_mult_point_t);
+
+    // mat_34 tests
+    RUN_TEST(test_mat34_set_zero);
+    RUN_TEST(test_mat34_get_rows);
+    RUN_TEST(test_mat34_multiply_vector);
+    RUN_TEST(test_mat34_transpose);
+    RUN_TEST(test_mat34_trans_mult_vec3);
+
+    // Matrix-matrix multiply tests
+    RUN_TEST(test_mat33_multiply_mat33);
+    RUN_TEST(test_mat33_multiply_mat33_nontrivial);
+    RUN_TEST(test_mat44_multiply_mat44);
+
+    // Row-vector * matrix tests
+    RUN_TEST(test_vec3_multiply_mat33);
+    RUN_TEST(test_vec3_multiply_mat34);
+    RUN_TEST(test_vec4_multiply_mat43);
+    RUN_TEST(test_vec4_multiply_mat44);
+
+    // mat_44 trans_mult tests
+    RUN_TEST(test_mat44_trans_mult_vec4);
+    RUN_TEST(test_mat44_trans_mult_vector_t);
+    RUN_TEST(test_mat44_trans_mult_point_t);
+
+    // Matrix negate tests
+    RUN_TEST(test_mat33_negate);
+    RUN_TEST(test_mat44_negate);
+    RUN_TEST(test_mat43_negate);
+    RUN_TEST(test_mat34_negate);
+
+    // mat_44 rotation tests
+    RUN_TEST(test_mat44_set_rotate_x);
+    RUN_TEST(test_mat44_set_rotate_z);
+    RUN_TEST(test_mat44_set_rotate_y);
+
+    // mat_33 rotation tests
+    RUN_TEST(test_mat33_set_rotate_x);
+    RUN_TEST(test_mat33_set_rotate_y);
+    RUN_TEST(test_mat33_set_rotate_z);
+
+    // mat_33 trans_mult and arithmetic
+    RUN_TEST(test_mat33_trans_mult_vec3);
+    RUN_TEST(test_mat33_trans_mult_mat33);
+    RUN_TEST(test_mat33_add);
+    RUN_TEST(test_mat33_subtract);
+
+    // mat_33 quaternion constructor and mult_tilde
+    RUN_TEST(test_mat33_from_quat);
+    RUN_TEST(test_mat33_mult_tilde);
+
+    // mat_44 arithmetic and products
+    RUN_TEST(test_mat44_add);
+    RUN_TEST(test_mat44_subtract);
+    RUN_TEST(test_mat44_trans_mult_mat44);
+    RUN_TEST(test_mat44_multiply_mat43);
+
+    // mat_43 arithmetic and products
+    RUN_TEST(test_mat43_add);
+    RUN_TEST(test_mat43_subtract);
+    RUN_TEST(test_mat43_multiply_mat33);
+    RUN_TEST(test_mat43_mult_tilde);
+
+    // mat_34 arithmetic and products
+    RUN_TEST(test_mat34_add);
+    RUN_TEST(test_mat34_subtract);
+    RUN_TEST(test_mat34_multiply_mat43);
+    RUN_TEST(test_mat34_multiply_mat44);
+
+    // mat_33::inverse, vec_xyz tilde, transform_t operations
+    RUN_TEST(test_mat33_inverse);
+    RUN_TEST(test_vec_xyz_tilde);
+    RUN_TEST(test_transform_mult_vec4);
+    RUN_TEST(test_transform_mult_vector);
+    RUN_TEST(test_transform_mult_point);
+    RUN_TEST(test_transform_inverse);
+    RUN_TEST(test_transform_orthonormal_inverse);
+    RUN_TEST(test_transform_orthonormal_inverse_in_place);
+
+#ifndef USE_CPU_COMPAT
+    printf("\n--- cpu_vec_4 / cpu_mat_44 VU0 asm fast paths ---\n\n");
+    RUN_TEST(test_cpu_vec4_vu0_addition);
+    RUN_TEST(test_cpu_vec4_vu0_addition_signed);
+    RUN_TEST(test_cpu_vec4_vu0_subtraction);
+    RUN_TEST(test_cpu_vec4_vu0_negation);
+    RUN_TEST(test_cpu_vec4_vu0_negation_zero);
+    RUN_TEST(test_cpu_vec4_vu0_component_mul);
+    RUN_TEST(test_cpu_vec4_vu0_component_mul_signed);
+    RUN_TEST(test_cpu_mat44_vu0_mult_vector_identity);
+    RUN_TEST(test_cpu_mat44_vu0_mult_vector_diag);
+    RUN_TEST(test_cpu_mat44_vu0_mult_vector_affine);
+    RUN_TEST(test_cpu_mat44_vu0_mult_vector_general);
+    RUN_TEST(test_cpu_mat44_vu0_mult_mat_identity);
+    RUN_TEST(test_cpu_mat44_vu0_mult_mat_scale_translate);
+    RUN_TEST(test_cpu_mat44_vu0_mult_mat_general);
+#endif // !USE_CPU_COMPAT
+
+#ifdef USE_CPU_COMPAT
+    printf("\n--- CPU legacy types (cpu_vec_3, cpu_vec_4, cpu_mat_44) ---\n\n");
+    RUN_TEST(test_cpu_vec3_construction);
+    RUN_TEST(test_cpu_vec3_set);
+    RUN_TEST(test_cpu_vec3_addition);
+    RUN_TEST(test_cpu_vec3_subtraction);
+    RUN_TEST(test_cpu_vec3_negation);
+    RUN_TEST(test_cpu_vec3_scalar_mul);
+    RUN_TEST(test_cpu_vec3_dot);
+    RUN_TEST(test_cpu_vec3_cross);
+    RUN_TEST(test_cpu_vec3_cross_general);
+    RUN_TEST(test_cpu_vec3_normalized);
+    RUN_TEST(test_cpu_vec3_normalize);
+    RUN_TEST(test_cpu_vec3_index_operator);
+    RUN_TEST(test_cpu_vec4_construction);
+    RUN_TEST(test_cpu_vec4_set);
+    RUN_TEST(test_cpu_vec4_addition);
+    RUN_TEST(test_cpu_vec4_subtraction);
+    RUN_TEST(test_cpu_vec4_negation);
+    RUN_TEST(test_cpu_vec4_component_mul);
+    RUN_TEST(test_cpu_vec4_scalar_mul);
+    RUN_TEST(test_cpu_vec4_dot);
+    RUN_TEST(test_cpu_vec4_length);
+    RUN_TEST(test_cpu_vec4_normalized);
+    RUN_TEST(test_cpu_vec4_normalize);
+    RUN_TEST(test_cpu_mat44_set_identity);
+    RUN_TEST(test_cpu_mat44_col_access);
+    RUN_TEST(test_cpu_mat44_mult_vector_identity);
+    RUN_TEST(test_cpu_mat44_mult_vector);
+    RUN_TEST(test_cpu_mat44_set_scale);
+    RUN_TEST(test_cpu_mat44_set_translate);
+    RUN_TEST(test_cpu_mat44_set_rotate_z);
+    RUN_TEST(test_cpu_mat44_set_rotate_x);
+    RUN_TEST(test_cpu_mat44_transpose);
+    RUN_TEST(test_cpu_mat44_multiply_mat);
+#endif // USE_CPU_COMPAT
+
+    printf("=== All Tests Completed ===\n");
+    printf("Total tests run: %d\n", test_count);
+    
+    if (failed_count > 0) {
+        printf("\n=== FAILED TESTS ===\n");
+        printf("Total failed: %d\n", failed_count);
+        for (int i = 0; i < failed_count; i++) {
+            printf("  - %s\n", failed_tests[i]);
+        }
+        printf("\n");
+    } else {
+        printf("All tests passed!\n");
+    }
+
+    SleepThread();
+    
+    return (failed_count > 0) ? 1 : 0;
+}
+