From f1d3b255ffe409254a92c8d0a530d74bb2a78ccd Mon Sep 17 00:00:00 2001
From: Vijit Singh <vijit.n.singh@gmail.com>
Date: Sat, 13 Jun 2026 14:25:10 -0500
Subject: [PATCH 1/4] fix(hugepages): size 1G reservation by NUMA nodes, not
 sockets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RandomX fast mode keeps a NUMA-local copy of the ~2080 MB dataset per NUMA node
(XMRig allocates one dataset per node). util/proposed-grub.sh multiplied the
per-dataset 1G page count (3) by the SOCKET count, but a single-socket EPYC 7642
exposes 4 NUMA nodes — so setup reserved 3x 1G instead of 12x. The boxes ran fine
on an older boot's reservation, but a fresh setup + reboot would leave 3 of 4
nodes without 1G backing and tank hashrate.

Detect NUMA nodes (lscpu "NUMA node(s)", then count node* entries under
/sys/devices/system/node, then fall back to the socket count, which itself
defaults to 1) and scale both the 1G reservation and the pure-2M fallback by
it. 2M scratchpad sizing is per-thread total and unchanged.

- proposed-grub.sh: NUMA_NODES detection + use it for TOTAL_GB_PAGES and
  TOTAL_2MB_FALLBACK; verbose output shows the NUMA node count.
- tests: stub lscpu now emits "NUMA node(s)" (defaults to socket count, so
  existing assertions are unchanged); added cases for the multi-NUMA 1G scaling,
  the 2M fallback scaling, and the sysfs/socket detection fallbacks.

Verified on a real 4-NUMA EPYC 7642: lscpu reports 4 nodes, calculator now emits
hugepagesz=1G hugepages=12 (was 3). Found while upgrading a fleet to v1.0.0; the
bug is in released v1.0.0 (and 0.1.0).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md          |  9 +++++++++
 tests/run.sh          | 26 ++++++++++++++++++++++++++
 util/proposed-grub.sh | 26 +++++++++++++++++++++-----
 3 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2cd4c4c..77ddcb1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,15 @@ All notable changes to RigForge are documented here. The format is based on
 
 ## [Unreleased]
 
+### Fixed
+- **HugePage sizing is now NUMA-aware (1 GB pages).** RandomX fast mode keeps a NUMA-local copy of the
+  ~2080 MB dataset **per NUMA node**, but the reservation math multiplied the per-dataset 1 GB pages by the
+  **socket** count, not the NUMA-node count. On a single-socket, multi-NUMA CPU — e.g. an EPYC 7642 with 4
+  NUMA nodes — `setup` reserved 3× 1 GB instead of 12×, so after a reboot three of four nodes lost 1 GB
+  backing and hashrate dropped hard. Sizing now scales the 1 GB reservation (and the pure-2 MB fallback)
+  by NUMA nodes, detected via `lscpu` then `/sys/devices/system/node`, falling back to the socket count.
+  2 MB scratchpad sizing is per-thread total and unaffected. Verified on a 4-NUMA EPYC (now reserves 12).
+
 ## [1.0.0] - 2026-06-13
 
 First stable release. RigForge turns a fresh Ubuntu/Debian (or macOS) machine into a fully tuned
diff --git a/tests/run.sh b/tests/run.sh
index 1e600af..86c05e7 100644
--- a/tests/run.sh
+++ b/tests/run.sh
@@ -96,6 +96,9 @@ EOF
 echo "Model name:            ${STUB_CPU_MODEL:-Generic CPU}"
 echo "L3 cache:              ${STUB_L3:-8 MiB}"
 echo "Socket(s):             ${STUB_SOCKETS:-1}"
+# NUMA nodes can exceed sockets (NPS / L3-as-NUMA on EPYC); default to the socket count so existing
+# single-value tests are unchanged, and let STUB_NUMA_NODES drive the multi-NUMA cases.
+echo "NUMA node(s):          ${STUB_NUMA_NODES:-${STUB_SOCKETS:-1}}"
 # Modern lscpu (as root) also prints a DMI-derived BIOS line; the model parse must NOT pick this up.
 echo "BIOS Model name:       ${STUB_CPU_MODEL:-Generic CPU}            Unknown CPU @ 4.2GHz"
 EOF
@@ -673,6 +676,29 @@ assert_eq "grub --runtime: RX_THREADS fallback (#65)" "$out" "1242"
 out="$(PATH="$STUBS:$PATH" STUB_L3="32 MiB" STUB_SOCKETS=1 RX_THREADS=0 HUGEPAGES_1G_NR="$SANDBOX/nr_4" bash "$PG" --runtime)"
 assert_eq "grub --runtime: RX_THREADS=0 falls back to L3 (#65)" "$out" "154"
 
+# --- NUMA-aware 1G sizing: RandomX keeps a NUMA-LOCAL dataset copy per node, so 1G pages scale with NUMA
+# nodes, NOT sockets. A single-socket EPYC with 4 NUMA nodes needs 12 (3*4), not 3 — the bug that starved
+# 3 of 4 nodes after a reboot. (256 MiB L3 -> threads 128 -> 2M scratchpads 128+128+10 = 266.)
+out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG" -q)"
+assert_contains "grub: 1G scales with NUMA nodes not sockets (1S/4N -> 12)" "$out" "hugepagesz=1G hugepages=12"
+assert_contains "grub: 2M scratchpads are per-thread total, not NUMA-multiplied" "$out" "hugepagesz=2M hugepages=266"
+# The pure-2M fallback (no pdpe1gb) also holds a dataset copy per node: 1168*4 + 128 + 50 = 4850.
+out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_no1g" bash "$PG" -q)"
+assert_contains "grub: 2M fallback dataset scales per NUMA node (1168*4+...)" "$out" "hugepages=4850"
+# Detection fallbacks when lscpu lacks a "NUMA node(s)" line: count sysfs nodes, then sockets, then 1.
+mkdir -p "$SANDBOX/nonuma" "$SANDBOX/numa4/node0" "$SANDBOX/numa4/node1" "$SANDBOX/numa4/node2" "$SANDBOX/numa4/node3" "$SANDBOX/numa_empty"
+cat >"$SANDBOX/nonuma/lscpu" <<'EOF'
+#!/usr/bin/env bash
+echo "Model name:            EPYC test"
+echo "L3 cache:              ${STUB_L3:-256 MiB}"
+echo "Socket(s):             ${STUB_SOCKETS:-1}"
+EOF
+chmod +x "$SANDBOX/nonuma/lscpu"
+out="$(PATH="$SANDBOX/nonuma:$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 NODE_SYS="$SANDBOX/numa4" CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG" -q)"
+assert_contains "grub: NUMA from sysfs node count when lscpu silent (4 -> 12)" "$out" "hugepagesz=1G hugepages=12"
+out="$(PATH="$SANDBOX/nonuma:$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=2 NODE_SYS="$SANDBOX/numa_empty" CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG" -q)"
+assert_contains "grub: NUMA falls back to sockets when undetectable (2 -> 6)" "$out" "hugepagesz=1G hugepages=6"
+
 # ---------------------------------------------------------------------------
 # tune_kernel must MERGE its HugePage/MSR params into the existing GRUB cmdline, not overwrite it
 # wholesale (#19 — overwriting drops other kernel params; a boot-safety risk).
diff --git a/util/proposed-grub.sh b/util/proposed-grub.sh
index 2365fdb..3a02d6e 100755
--- a/util/proposed-grub.sh
+++ b/util/proposed-grub.sh
@@ -39,12 +39,26 @@ if [[ -z "$L3_MB" ]]; then
     L3_MB=4
 fi
 
-# Detect Physical CPU Sockets (NUMA Nodes)
+# Detect Physical CPU Sockets (for display / NUMA fallback).
 SOCKETS=$(lscpu | grep "Socket(s):" | awk '{print $2}')
 if [[ -z "$SOCKETS" ]]; then
     SOCKETS=1
 fi
 
+# Detect NUMA nodes. RandomX fast mode (with XMRig's numa=on) keeps a NUMA-LOCAL copy of the ~2080MB
+# dataset PER NODE, so the 1GB-page reservation must scale with NUMA NODES, not sockets: a single-socket
+# EPYC can expose 2/4/8 NUMA nodes (NPS / L3-as-NUMA), so counting sockets reserves only one node's worth
+# and starves every other node of 1GB backing after a reboot (a large RandomX hashrate hit). Prefer
+# lscpu's count, then count sysfs nodes, then fall back to the socket count, then 1.
+NODE_SYS="${NODE_SYS:-/sys/devices/system/node}"
+NUMA_NODES=$(lscpu 2>/dev/null | awk -F: '/^NUMA node\(s\):/ {gsub(/[^0-9]/, "", $2); print $2; exit}')
+if ! { [ -n "$NUMA_NODES" ] && [ "$NUMA_NODES" -gt 0 ]; } 2>/dev/null; then
+    NUMA_NODES=$(find "$NODE_SYS" -maxdepth 1 -name 'node[0-9]*' 2>/dev/null | wc -l | tr -d ' ')
+fi
+if ! { [ -n "$NUMA_NODES" ] && [ "$NUMA_NODES" -gt 0 ]; } 2>/dev/null; then
+    NUMA_NODES="$SOCKETS"
+fi
+
 # --- 2. Resource Calculation ---
 
 # RandomX Requirement: 2MB L3 Cache per mining thread. Callers can override this estimate with RX_THREADS
@@ -56,16 +70,17 @@ else
     THREADS=$((L3_MB / 2))
 fi
 
-# 1GB HugePages: Reserve 3GB per socket for the RandomX dataset (~2080MB) + overhead
-TOTAL_GB_PAGES=$((3 * SOCKETS))
+# 1GB HugePages: 3 per NUMA node — each node holds its own ~2080MB RandomX dataset copy (rounds up to 3GB).
+TOTAL_GB_PAGES=$((3 * NUMA_NODES))
 
 # 2MB HugePages: Reserve for JIT compiler and scratchpads (128 base + 1 per thread + buffer)
 TOTAL_2MB_PAGES=$((128 + THREADS + 10))
 
 # Fallback Strategy (Pure 2MB): Covers Dataset (2080MB) + Overhead + JIT
-# 1168 pages * 2MB = ~2336MB per socket (Provides ~250MB buffer for fragmentation)
+# 1168 pages * 2MB = ~2336MB per NUMA node (Provides ~250MB buffer for fragmentation). Scales per node
+# because, like the 1GB path, each NUMA node holds its own dataset copy.
 BASE_2MB_PAGES=1168
-TOTAL_2MB_FALLBACK=$(((BASE_2MB_PAGES * SOCKETS) + THREADS + 50))
+TOTAL_2MB_FALLBACK=$(((BASE_2MB_PAGES * NUMA_NODES) + THREADS + 50))
 
 if [ "$RUNTIME" -eq 1 ]; then
     # Check if 1GB pages are already allocated
@@ -101,6 +116,7 @@ else
     echo "--- Hardware Analysis ---"
     echo "L3 Cache:      ${L3_MB} MB"
     echo "CPU Sockets:   $SOCKETS"
+    echo "NUMA Nodes:    $NUMA_NODES (1GB dataset reservation scales with this)"
     echo "Max Threads:   $THREADS (Based on 2MB L3/thread)"
     echo "-------------------------"
     echo "Proposed GRUB Configuration:"

From 9c0a024e9f81da0ff929777f0cb3a20ed88036d1 Mon Sep 17 00:00:00 2001
From: Vijit Singh <vijit.n.singh@gmail.com>
Date: Sat, 13 Jun 2026 14:29:40 -0500
Subject: [PATCH 2/4] test: cover the verbose NUMA-node output line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

diff-cover flagged the new verbose "NUMA Nodes:" output line (proposed-grub.sh:119)
as uncovered — every other proposed-grub test runs with -q or --runtime. Add a
verbose-mode assertion exercising it (and the sockets line alongside it).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/run.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/run.sh b/tests/run.sh
index 86c05e7..de261c8 100644
--- a/tests/run.sh
+++ b/tests/run.sh
@@ -682,6 +682,10 @@ assert_eq "grub --runtime: RX_THREADS=0 falls back to L3 (#65)" "$out" "154"
 out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG" -q)"
 assert_contains "grub: 1G scales with NUMA nodes not sockets (1S/4N -> 12)" "$out" "hugepagesz=1G hugepages=12"
 assert_contains "grub: 2M scratchpads are per-thread total, not NUMA-multiplied" "$out" "hugepagesz=2M hugepages=266"
+# Verbose mode reports the NUMA node count it sized against (distinct from sockets).
+out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG")"
+assert_contains "grub: verbose reports NUMA node count" "$out" "NUMA Nodes:    4"
+assert_contains "grub: verbose still reports sockets separately" "$out" "CPU Sockets:   1"
 # The pure-2M fallback (no pdpe1gb) also holds a dataset copy per node: 1168*4 + 128 + 50 = 4850.
 out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_no1g" bash "$PG" -q)"
 assert_contains "grub: 2M fallback dataset scales per NUMA node (1168*4+...)" "$out" "hugepages=4850"

From 64a79821f7a681d4b8427b9e681bd29b707e029b Mon Sep 17 00:00:00 2001
From: Vijit Singh <vijit.n.singh@gmail.com>
Date: Sat, 13 Jun 2026 14:49:24 -0500
Subject: [PATCH 3/4] test(e2e-real): survive autotune-disabled in the #reown
 check; docs: NUMA sizing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-ups found while validating the NUMA fix with a clean install on a real
EPYC (autotune disabled in that worker's config):

- tests/e2e-real.sh: the #92 re-own check did `op=$(systemctl cat
  rigforge-autotune.service ...)`. When autotune is disabled the unit doesn't
  exist, systemctl exits non-zero, and under the gate's `set -Eeuo pipefail` the
  bare assignment aborted the whole verify phase right before the SKIP branch that
  was meant to handle exactly this. Add `|| true` so it reaches the skip. (Earlier
  gate runs all had autotune enabled, so this never surfaced.)
- docs/hardware.md: `randomx.numa` was described as "a no-op on single-socket",
  but a single-socket EPYC exposes several NUMA nodes — the misconception behind
  the 1G HugePage sizing bug. Clarify that the reservation scales per NUMA node.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/hardware.md  | 2 +-
 tests/e2e-real.sh | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/hardware.md b/docs/hardware.md
index ca0be22..430fb2e 100644
--- a/docs/hardware.md
+++ b/docs/hardware.md
@@ -58,7 +58,7 @@ auto-detection and layers on a few defaults that make sense because the box is a
 | `cpu.rx` | `-1` (auto) | XMRig sizes the thread count to L3 cache (~2 MB/thread) from detected topology — correct on EPYC, Ryzen, Intel hybrid, and X3D (incl. dual-CCD parts) alike. |
 | `cpu.asm` | `auto` | XMRig picks the Ryzen / Intel / Bulldozer assembly path for the detected CPU. |
 | `randomx.wrmsr` | `true` | Auto-applies the correct per-family MSR preset (needs root + the `msr` module). |
-| `randomx.numa` | `true` | A no-op on single-socket; spreads the dataset per node on multi-socket / EPYC. |
+| `randomx.numa` | `true` | A no-op on single-NUMA machines; on multi-NUMA CPUs it gives each node its own dataset copy. Note a single-socket EPYC can still expose several NUMA nodes — so RigForge sizes the 1 GB HugePage reservation per NUMA node, not per socket. |
 | `cpu.yield` | `false` | Busy-wait for maximum hashrate (we own the whole machine). |
 | `cpu.priority` | `2` | Wins scheduling vs. background daemons (XMRig warns >2 can hang a desktop). |
 | `cpu.huge-pages` / `randomx.1gb-pages` | `true` (Linux) | The single biggest lever; see below. |
diff --git a/tests/e2e-real.sh b/tests/e2e-real.sh
index 888298e..3fd229b 100755
--- a/tests/e2e-real.sh
+++ b/tests/e2e-real.sh
@@ -426,7 +426,10 @@ verify() {
     # RIGFORGE_OPERATOR for the re-own to hand files back to the operator (not root). Assert the unit
     # carries it, then exercise the re-own exactly the way the root timer does (no SUDO_USER + that operator).
     local wr="$HERE/data/worker" op=""
-    op=$(systemctl cat rigforge-autotune.service 2>/dev/null | sed -nE 's/^Environment=RIGFORGE_OPERATOR=//p' | head -1)
+    # `|| true`: when autotune is disabled the unit doesn't exist, so `systemctl cat` exits non-zero —
+    # which, under this script's `set -Eeuo pipefail`, would abort the whole gate before the SKIP branch
+    # below. Swallow it so a worker with autotune off cleanly reaches the skip.
+    op=$(systemctl cat rigforge-autotune.service 2>/dev/null | sed -nE 's/^Environment=RIGFORGE_OPERATOR=//p' | head -1 || true)
     if [ -n "$op" ]; then
         ok "autotune unit bakes in RIGFORGE_OPERATOR=$op (#reown)"
         if [ -d "$wr" ]; then

From d338d38f614e437ff2583d86d743b0a57c31b3ae Mon Sep 17 00:00:00 2001
From: Vijit Singh <vijit.n.singh@gmail.com>
Date: Sat, 13 Jun 2026 14:59:01 -0500
Subject: [PATCH 4/4] docs(changelog): reference #111 on the NUMA HugePage fix
 entry

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 77ddcb1..7766527 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ All notable changes to RigForge are documented here. The format is based on
 ## [Unreleased]
 
 ### Fixed
-- **HugePage sizing is now NUMA-aware (1 GB pages).** RandomX fast mode keeps a NUMA-local copy of the
+- **HugePage sizing is now NUMA-aware (1 GB pages) (#111).** RandomX fast mode keeps a NUMA-local copy of the
   ~2080 MB dataset **per NUMA node**, but the reservation math multiplied the per-dataset 1 GB pages by the
   **socket** count, not the NUMA-node count. On a single-socket, multi-NUMA CPU — e.g. an EPYC 7642 with 4
   NUMA nodes — `setup` reserved 3× 1 GB instead of 12×, so after a reboot three of four nodes lost 1 GB