From f1d3b255ffe409254a92c8d0a530d74bb2a78ccd Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Sat, 13 Jun 2026 14:25:10 -0500 Subject: [PATCH 1/4] fix(hugepages): size 1G reservation by NUMA nodes, not sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RandomX fast mode keeps a NUMA-local copy of the ~2080 MB dataset per NUMA node (XMRig allocates one dataset per node). util/proposed-grub.sh multiplied the per-dataset 1G page count (3) by the SOCKET count, but a single-socket EPYC 7642 exposes 4 NUMA nodes — so setup reserved 3x 1G instead of 12x. The boxes ran fine on an older boot's reservation, but a fresh setup + reboot would leave 3 of 4 nodes without 1G backing and tank hashrate. Detect NUMA nodes (lscpu "NUMA node(s)", then count /sys/devices/system/node, then fall back to sockets, then 1) and scale both the 1G reservation and the pure-2M fallback by it. 2M scratchpad sizing is per-thread total and unchanged. - proposed-grub.sh: NUMA_NODES detection + use it for TOTAL_GB_PAGES and TOTAL_2MB_FALLBACK; verbose output shows the NUMA node count. - tests: stub lscpu now emits "NUMA node(s)" (defaults to socket count, so existing assertions are unchanged); added cases for the multi-NUMA 1G scaling, the 2M fallback scaling, and the sysfs/socket detection fallbacks. Verified on a real 4-NUMA EPYC 7642: lscpu reports 4 nodes, calculator now emits hugepages=1G hugepages=12 (was 3). Found while upgrading a fleet to v1.0.0; the bug is in released v1.0.0 (and 0.1.0). Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 9 +++++++++ tests/run.sh | 26 ++++++++++++++++++++++++++ util/proposed-grub.sh | 26 +++++++++++++++++++++----- 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2cd4c4c..77ddcb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ All notable changes to RigForge are documented here. The format is based on ## [Unreleased] +### Fixed +- **HugePage sizing is now NUMA-aware (1 GB pages).** RandomX fast mode keeps a NUMA-local copy of the + ~2080 MB dataset **per NUMA node**, but the reservation math multiplied the per-dataset 1 GB pages by the + **socket** count, not the NUMA-node count. On a single-socket, multi-NUMA CPU — e.g. an EPYC 7642 with 4 + NUMA nodes — `setup` reserved 3× 1 GB instead of 12×, so after a reboot three of four nodes lost 1 GB + backing and hashrate dropped hard. Sizing now scales the 1 GB reservation (and the pure-2 MB fallback) + by NUMA nodes, detected via `lscpu` then `/sys/devices/system/node`, falling back to the socket count. + 2 MB scratchpad sizing is per-thread total and unaffected. Verified on a 4-NUMA EPYC (now reserves 12). + ## [1.0.0] - 2026-06-13 First stable release. RigForge turns a fresh Ubuntu/Debian (or macOS) machine into a fully tuned diff --git a/tests/run.sh b/tests/run.sh index 1e600af..86c05e7 100644 --- a/tests/run.sh +++ b/tests/run.sh @@ -96,6 +96,9 @@ EOF echo "Model name: ${STUB_CPU_MODEL:-Generic CPU}" echo "L3 cache: ${STUB_L3:-8 MiB}" echo "Socket(s): ${STUB_SOCKETS:-1}" +# NUMA nodes can exceed sockets (NPS / L3-as-NUMA on EPYC); default to the socket count so existing +# single-value tests are unchanged, and let STUB_NUMA_NODES drive the multi-NUMA cases. +echo "NUMA node(s): ${STUB_NUMA_NODES:-${STUB_SOCKETS:-1}}" # Modern lscpu (as root) also prints a DMI-derived BIOS line; the model parse must NOT pick this up. echo "BIOS Model name: ${STUB_CPU_MODEL:-Generic CPU} Unknown CPU @ 4.2GHz" EOF @@ -673,6 +676,29 @@ assert_eq "grub --runtime: RX_THREADS fallback (#65)" "$out" "1242" out="$(PATH="$STUBS:$PATH" STUB_L3="32 MiB" STUB_SOCKETS=1 RX_THREADS=0 HUGEPAGES_1G_NR="$SANDBOX/nr_4" bash "$PG" --runtime)" assert_eq "grub --runtime: RX_THREADS=0 falls back to L3 (#65)" "$out" "154" +# --- NUMA-aware 1G sizing: RandomX keeps a NUMA-LOCAL dataset copy per node, so 1G pages scale with NUMA +# nodes, NOT sockets. A single-socket EPYC with 4 NUMA nodes needs 12 (3*4), not 3 — the bug that starved +# 3 of 4 nodes after a reboot. (256 MiB L3 -> threads 128 -> 2M scratchpads 128+128+10 = 266.) +out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG" -q)" +assert_contains "grub: 1G scales with NUMA nodes not sockets (1S/4N -> 12)" "$out" "hugepagesz=1G hugepages=12" +assert_contains "grub: 2M scratchpads are per-thread total, not NUMA-multiplied" "$out" "hugepagesz=2M hugepages=266" +# The pure-2M fallback (no pdpe1gb) also holds a dataset copy per node: 1168*4 + 128 + 50 = 4850. +out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_no1g" bash "$PG" -q)" +assert_contains "grub: 2M fallback dataset scales per NUMA node (1168*4+...)" "$out" "hugepages=4850" +# Detection fallbacks when lscpu lacks a "NUMA node(s)" line: count sysfs nodes, then sockets, then 1. +mkdir -p "$SANDBOX/nonuma" "$SANDBOX/numa4/node0" "$SANDBOX/numa4/node1" "$SANDBOX/numa4/node2" "$SANDBOX/numa4/node3" "$SANDBOX/numa_empty" +cat >"$SANDBOX/nonuma/lscpu" <<'EOF' +#!/usr/bin/env bash +echo "Model name: EPYC test" +echo "L3 cache: ${STUB_L3:-256 MiB}" +echo "Socket(s): ${STUB_SOCKETS:-1}" +EOF +chmod +x "$SANDBOX/nonuma/lscpu" +out="$(PATH="$SANDBOX/nonuma:$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 NODE_SYS="$SANDBOX/numa4" CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG" -q)" +assert_contains "grub: NUMA from sysfs node count when lscpu silent (4 -> 12)" "$out" "hugepagesz=1G hugepages=12" +out="$(PATH="$SANDBOX/nonuma:$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=2 NODE_SYS="$SANDBOX/numa_empty" CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG" -q)" +assert_contains "grub: NUMA falls back to sockets when undetectable (2 -> 6)" "$out" "hugepagesz=1G hugepages=6" + # --------------------------------------------------------------------------- # tune_kernel must MERGE its HugePage/MSR params into the existing GRUB cmdline, not overwrite it # wholesale (#19 — overwriting drops other kernel params; a boot-safety risk). diff --git a/util/proposed-grub.sh b/util/proposed-grub.sh index 2365fdb..3a02d6e 100755 --- a/util/proposed-grub.sh +++ b/util/proposed-grub.sh @@ -39,12 +39,26 @@ if [[ -z "$L3_MB" ]]; then L3_MB=4 fi -# Detect Physical CPU Sockets (NUMA Nodes) +# Detect Physical CPU Sockets (for display / NUMA fallback). SOCKETS=$(lscpu | grep "Socket(s):" | awk '{print $2}') if [[ -z "$SOCKETS" ]]; then SOCKETS=1 fi +# Detect NUMA nodes. RandomX fast mode (with XMRig's numa=on) keeps a NUMA-LOCAL copy of the ~2080MB +# dataset PER NODE, so the 1GB-page reservation must scale with NUMA NODES, not sockets: a single-socket +# EPYC can expose 2/4/8 NUMA nodes (NPS / L3-as-NUMA), so counting sockets reserves only one node's worth +# and starves every other node of 1GB backing after a reboot (a large RandomX hashrate hit). Prefer +# lscpu's count, then count sysfs nodes, then fall back to the socket count, then 1. +NODE_SYS="${NODE_SYS:-/sys/devices/system/node}" +NUMA_NODES=$(lscpu 2>/dev/null | awk -F: '/^NUMA node\(s\):/ {gsub(/[^0-9]/, "", $2); print $2; exit}') +if ! { [ -n "$NUMA_NODES" ] && [ "$NUMA_NODES" -gt 0 ]; } 2>/dev/null; then + NUMA_NODES=$(find "$NODE_SYS" -maxdepth 1 -name 'node[0-9]*' 2>/dev/null | wc -l | tr -d ' ') +fi +if ! { [ -n "$NUMA_NODES" ] && [ "$NUMA_NODES" -gt 0 ]; } 2>/dev/null; then + NUMA_NODES="$SOCKETS" +fi + # --- 2. Resource Calculation --- # RandomX Requirement: 2MB L3 Cache per mining thread. Callers can override this estimate with RX_THREADS @@ -56,16 +70,17 @@ else THREADS=$((L3_MB / 2)) fi -# 1GB HugePages: Reserve 3GB per socket for the RandomX dataset (~2080MB) + overhead -TOTAL_GB_PAGES=$((3 * SOCKETS)) +# 1GB HugePages: 3 per NUMA node — each node holds its own ~2080MB RandomX dataset copy (rounds up to 3GB). +TOTAL_GB_PAGES=$((3 * NUMA_NODES)) # 2MB HugePages: Reserve for JIT compiler and scratchpads (128 base + 1 per thread + buffer) TOTAL_2MB_PAGES=$((128 + THREADS + 10)) # Fallback Strategy (Pure 2MB): Covers Dataset (2080MB) + Overhead + JIT -# 1168 pages * 2MB = ~2336MB per socket (Provides ~250MB buffer for fragmentation) +# 1168 pages * 2MB = ~2336MB per NUMA node (Provides ~250MB buffer for fragmentation). Scales per node +# because, like the 1GB path, each NUMA node holds its own dataset copy. BASE_2MB_PAGES=1168 -TOTAL_2MB_FALLBACK=$(((BASE_2MB_PAGES * SOCKETS) + THREADS + 50)) +TOTAL_2MB_FALLBACK=$(((BASE_2MB_PAGES * NUMA_NODES) + THREADS + 50)) if [ "$RUNTIME" -eq 1 ]; then # Check if 1GB pages are already allocated @@ -101,6 +116,7 @@ else echo "--- Hardware Analysis ---" echo "L3 Cache: ${L3_MB} MB" echo "CPU Sockets: $SOCKETS" + echo "NUMA Nodes: $NUMA_NODES (1GB dataset reservation scales with this)" echo "Max Threads: $THREADS (Based on 2MB L3/thread)" echo "-------------------------" echo "Proposed GRUB Configuration:" From 9c0a024e9f81da0ff929777f0cb3a20ed88036d1 Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Sat, 13 Jun 2026 14:29:40 -0500 Subject: [PATCH 2/4] test: cover the verbose NUMA-node output line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit diff-cover flagged the new verbose "NUMA Nodes:" output line (proposed-grub.sh:119) as uncovered — every other proposed-grub test runs with -q or --runtime. Add a verbose-mode assertion exercising it (and the sockets line alongside it). Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/run.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/run.sh b/tests/run.sh index 86c05e7..de261c8 100644 --- a/tests/run.sh +++ b/tests/run.sh @@ -682,6 +682,10 @@ assert_eq "grub --runtime: RX_THREADS=0 falls back to L3 (#65)" "$out" "154" out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG" -q)" assert_contains "grub: 1G scales with NUMA nodes not sockets (1S/4N -> 12)" "$out" "hugepagesz=1G hugepages=12" assert_contains "grub: 2M scratchpads are per-thread total, not NUMA-multiplied" "$out" "hugepagesz=2M hugepages=266" +# Verbose mode reports the NUMA node count it sized against (distinct from sockets). +out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG")" +assert_contains "grub: verbose reports NUMA node count" "$out" "NUMA Nodes: 4" +assert_contains "grub: verbose still reports sockets separately" "$out" "CPU Sockets: 1" # The pure-2M fallback (no pdpe1gb) also holds a dataset copy per node: 1168*4 + 128 + 50 = 4850. out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_no1g" bash "$PG" -q)" assert_contains "grub: 2M fallback dataset scales per NUMA node (1168*4+...)" "$out" "hugepages=4850" From 64a79821f7a681d4b8427b9e681bd29b707e029b Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Sat, 13 Jun 2026 14:49:24 -0500 Subject: [PATCH 3/4] test(e2e-real): survive autotune-disabled in the #reown check; docs: NUMA sizing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-ups found while validating the NUMA fix with a clean install on a real EPYC (autotune disabled in that worker's config): - tests/e2e-real.sh: the #92 re-own check did `op=$(systemctl cat rigforge-autotune.service ...)`. When autotune is disabled the unit doesn't exist, systemctl exits non-zero, and under the gate's `set -Eeuo pipefail` the bare assignment aborted the whole verify phase right before the SKIP branch that was meant to handle exactly this. Add `|| true` so it reaches the skip. (Earlier gate runs all had autotune enabled, so this never surfaced.) - docs/hardware.md: `randomx.numa` was described as "a no-op on single-socket", but a single-socket EPYC exposes several NUMA nodes — the misconception behind the 1G HugePage sizing bug. Clarify that the reservation scales per NUMA node. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/hardware.md | 2 +- tests/e2e-real.sh | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/hardware.md b/docs/hardware.md index ca0be22..430fb2e 100644 --- a/docs/hardware.md +++ b/docs/hardware.md @@ -58,7 +58,7 @@ auto-detection and layers on a few defaults that make sense because the box is a | `cpu.rx` | `-1` (auto) | XMRig sizes the thread count to L3 cache (~2 MB/thread) from detected topology — correct on EPYC, Ryzen, Intel hybrid, and X3D (incl. dual-CCD parts) alike. | | `cpu.asm` | `auto` | XMRig picks the Ryzen / Intel / Bulldozer assembly path for the detected CPU. | | `randomx.wrmsr` | `true` | Auto-applies the correct per-family MSR preset (needs root + the `msr` module). | -| `randomx.numa` | `true` | A no-op on single-socket; spreads the dataset per node on multi-socket / EPYC. | +| `randomx.numa` | `true` | A no-op on single-NUMA machines; on multi-NUMA CPUs it gives each node its own dataset copy. Note a single-socket EPYC can still expose several NUMA nodes — so RigForge sizes the 1 GB HugePage reservation per NUMA node, not per socket. | | `cpu.yield` | `false` | Busy-wait for maximum hashrate (we own the whole machine). | | `cpu.priority` | `2` | Wins scheduling vs. background daemons (XMRig warns >2 can hang a desktop). | | `cpu.huge-pages` / `randomx.1gb-pages` | `true` (Linux) | The single biggest lever; see below. | diff --git a/tests/e2e-real.sh b/tests/e2e-real.sh index 888298e..3fd229b 100755 --- a/tests/e2e-real.sh +++ b/tests/e2e-real.sh @@ -426,7 +426,10 @@ verify() { # RIGFORGE_OPERATOR for the re-own to hand files back to the operator (not root). Assert the unit # carries it, then exercise the re-own exactly the way the root timer does (no SUDO_USER + that operator). local wr="$HERE/data/worker" op="" - op=$(systemctl cat rigforge-autotune.service 2>/dev/null | sed -nE 's/^Environment=RIGFORGE_OPERATOR=//p' | head -1) + # `|| true`: when autotune is disabled the unit doesn't exist, so `systemctl cat` exits non-zero — + # which, under this script's `set -Eeuo pipefail`, would abort the whole gate before the SKIP branch + # below. Swallow it so a worker with autotune off cleanly reaches the skip. + op=$(systemctl cat rigforge-autotune.service 2>/dev/null | sed -nE 's/^Environment=RIGFORGE_OPERATOR=//p' | head -1 || true) if [ -n "$op" ]; then ok "autotune unit bakes in RIGFORGE_OPERATOR=$op (#reown)" if [ -d "$wr" ]; then From d338d38f614e437ff2583d86d743b0a57c31b3ae Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Sat, 13 Jun 2026 14:59:01 -0500 Subject: [PATCH 4/4] docs(changelog): reference #111 on the NUMA HugePage fix entry Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 77ddcb1..7766527 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ All notable changes to RigForge are documented here. The format is based on ## [Unreleased] ### Fixed -- **HugePage sizing is now NUMA-aware (1 GB pages).** RandomX fast mode keeps a NUMA-local copy of the +- **HugePage sizing is now NUMA-aware (1 GB pages) (#111).** RandomX fast mode keeps a NUMA-local copy of the ~2080 MB dataset **per NUMA node**, but the reservation math multiplied the per-dataset 1 GB pages by the **socket** count, not the NUMA-node count. On a single-socket, multi-NUMA CPU — e.g. an EPYC 7642 with 4 NUMA nodes — `setup` reserved 3× 1 GB instead of 12×, so after a reboot three of four nodes lost 1 GB