diff --git a/CHANGELOG.md b/CHANGELOG.md index 2cd4c4c..7766527 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ All notable changes to RigForge are documented here. The format is based on ## [Unreleased] +### Fixed +- **HugePage sizing is now NUMA-aware (1 GB pages) (#111).** RandomX fast mode keeps a NUMA-local copy of the + ~2080 MB dataset **per NUMA node**, but the reservation math multiplied the per-dataset 1 GB pages by the + **socket** count, not the NUMA-node count. On a single-socket, multi-NUMA CPU — e.g. an EPYC 7642 with 4 + NUMA nodes — `setup` reserved 3× 1 GB instead of 12×, so after a reboot three of four nodes lost 1 GB + backing and hashrate dropped hard. Sizing now scales the 1 GB reservation (and the pure-2 MB fallback) + by NUMA nodes, detected via `lscpu` then `/sys/devices/system/node`, falling back to the socket count. + 2 MB scratchpad sizing is per-thread total and unaffected. Verified on a 4-NUMA EPYC (now reserves 12). + ## [1.0.0] - 2026-06-13 First stable release. RigForge turns a fresh Ubuntu/Debian (or macOS) machine into a fully tuned diff --git a/docs/hardware.md b/docs/hardware.md index ca0be22..430fb2e 100644 --- a/docs/hardware.md +++ b/docs/hardware.md @@ -58,7 +58,7 @@ auto-detection and layers on a few defaults that make sense because the box is a | `cpu.rx` | `-1` (auto) | XMRig sizes the thread count to L3 cache (~2 MB/thread) from detected topology — correct on EPYC, Ryzen, Intel hybrid, and X3D (incl. dual-CCD parts) alike. | | `cpu.asm` | `auto` | XMRig picks the Ryzen / Intel / Bulldozer assembly path for the detected CPU. | | `randomx.wrmsr` | `true` | Auto-applies the correct per-family MSR preset (needs root + the `msr` module). | -| `randomx.numa` | `true` | A no-op on single-socket; spreads the dataset per node on multi-socket / EPYC. | +| `randomx.numa` | `true` | A no-op on single-NUMA machines; on multi-NUMA CPUs it gives each node its own dataset copy. Note a single-socket EPYC can still expose several NUMA nodes — so RigForge sizes the 1 GB HugePage reservation per NUMA node, not per socket. | | `cpu.yield` | `false` | Busy-wait for maximum hashrate (we own the whole machine). | | `cpu.priority` | `2` | Wins scheduling vs. background daemons (XMRig warns >2 can hang a desktop). | | `cpu.huge-pages` / `randomx.1gb-pages` | `true` (Linux) | The single biggest lever; see below. | diff --git a/tests/e2e-real.sh b/tests/e2e-real.sh index 888298e..3fd229b 100755 --- a/tests/e2e-real.sh +++ b/tests/e2e-real.sh @@ -426,7 +426,10 @@ verify() { # RIGFORGE_OPERATOR for the re-own to hand files back to the operator (not root). Assert the unit # carries it, then exercise the re-own exactly the way the root timer does (no SUDO_USER + that operator). local wr="$HERE/data/worker" op="" - op=$(systemctl cat rigforge-autotune.service 2>/dev/null | sed -nE 's/^Environment=RIGFORGE_OPERATOR=//p' | head -1) + # `|| true`: when autotune is disabled the unit doesn't exist, so `systemctl cat` exits non-zero — + # which, under this script's `set -Eeuo pipefail`, would abort the whole gate before the SKIP branch + # below. Swallow it so a worker with autotune off cleanly reaches the skip. + op=$(systemctl cat rigforge-autotune.service 2>/dev/null | sed -nE 's/^Environment=RIGFORGE_OPERATOR=//p' | head -1 || true) if [ -n "$op" ]; then ok "autotune unit bakes in RIGFORGE_OPERATOR=$op (#reown)" if [ -d "$wr" ]; then diff --git a/tests/run.sh b/tests/run.sh index 1e600af..de261c8 100644 --- a/tests/run.sh +++ b/tests/run.sh @@ -96,6 +96,9 @@ EOF echo "Model name: ${STUB_CPU_MODEL:-Generic CPU}" echo "L3 cache: ${STUB_L3:-8 MiB}" echo "Socket(s): ${STUB_SOCKETS:-1}" +# NUMA nodes can exceed sockets (NPS / L3-as-NUMA on EPYC); default to the socket count so existing +# single-value tests are unchanged, and let STUB_NUMA_NODES drive the multi-NUMA cases. +echo "NUMA node(s): ${STUB_NUMA_NODES:-${STUB_SOCKETS:-1}}" # Modern lscpu (as root) also prints a DMI-derived BIOS line; the model parse must NOT pick this up. echo "BIOS Model name: ${STUB_CPU_MODEL:-Generic CPU} Unknown CPU @ 4.2GHz" EOF @@ -673,6 +676,33 @@ assert_eq "grub --runtime: RX_THREADS fallback (#65)" "$out" "1242" out="$(PATH="$STUBS:$PATH" STUB_L3="32 MiB" STUB_SOCKETS=1 RX_THREADS=0 HUGEPAGES_1G_NR="$SANDBOX/nr_4" bash "$PG" --runtime)" assert_eq "grub --runtime: RX_THREADS=0 falls back to L3 (#65)" "$out" "154" +# --- NUMA-aware 1G sizing: RandomX keeps a NUMA-LOCAL dataset copy per node, so 1G pages scale with NUMA +# nodes, NOT sockets. A single-socket EPYC with 4 NUMA nodes needs 12 (3*4), not 3 — the bug that starved +# 3 of 4 nodes after a reboot. (256 MiB L3 -> threads 128 -> 2M scratchpads 128+128+10 = 266.) +out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG" -q)" +assert_contains "grub: 1G scales with NUMA nodes not sockets (1S/4N -> 12)" "$out" "hugepagesz=1G hugepages=12" +assert_contains "grub: 2M scratchpads are per-thread total, not NUMA-multiplied" "$out" "hugepagesz=2M hugepages=266" +# Verbose mode reports the NUMA node count it sized against (distinct from sockets). +out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG")" +assert_contains "grub: verbose reports NUMA node count" "$out" "NUMA Nodes: 4" +assert_contains "grub: verbose still reports sockets separately" "$out" "CPU Sockets: 1" +# The pure-2M fallback (no pdpe1gb) also holds a dataset copy per node: 1168*4 + 128 + 50 = 4850. +out="$(PATH="$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 STUB_NUMA_NODES=4 CPUINFO="$SANDBOX/cpuinfo_no1g" bash "$PG" -q)" +assert_contains "grub: 2M fallback dataset scales per NUMA node (1168*4+...)" "$out" "hugepages=4850" +# Detection fallbacks when lscpu lacks a "NUMA node(s)" line: count sysfs nodes, then sockets, then 1. +mkdir -p "$SANDBOX/nonuma" "$SANDBOX/numa4/node0" "$SANDBOX/numa4/node1" "$SANDBOX/numa4/node2" "$SANDBOX/numa4/node3" "$SANDBOX/numa_empty" +cat >"$SANDBOX/nonuma/lscpu" <<'EOF' +#!/usr/bin/env bash +echo "Model name: EPYC test" +echo "L3 cache: ${STUB_L3:-256 MiB}" +echo "Socket(s): ${STUB_SOCKETS:-1}" +EOF +chmod +x "$SANDBOX/nonuma/lscpu" +out="$(PATH="$SANDBOX/nonuma:$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=1 NODE_SYS="$SANDBOX/numa4" CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG" -q)" +assert_contains "grub: NUMA from sysfs node count when lscpu silent (4 -> 12)" "$out" "hugepagesz=1G hugepages=12" +out="$(PATH="$SANDBOX/nonuma:$STUBS:$PATH" STUB_L3="256 MiB" STUB_SOCKETS=2 NODE_SYS="$SANDBOX/numa_empty" CPUINFO="$SANDBOX/cpuinfo_1g" bash "$PG" -q)" +assert_contains "grub: NUMA falls back to sockets when undetectable (2 -> 6)" "$out" "hugepagesz=1G hugepages=6" + # --------------------------------------------------------------------------- # tune_kernel must MERGE its HugePage/MSR params into the existing GRUB cmdline, not overwrite it # wholesale (#19 — overwriting drops other kernel params; a boot-safety risk). diff --git a/util/proposed-grub.sh b/util/proposed-grub.sh index 2365fdb..3a02d6e 100755 --- a/util/proposed-grub.sh +++ b/util/proposed-grub.sh @@ -39,12 +39,26 @@ if [[ -z "$L3_MB" ]]; then L3_MB=4 fi -# Detect Physical CPU Sockets (NUMA Nodes) +# Detect Physical CPU Sockets (for display / NUMA fallback). SOCKETS=$(lscpu | grep "Socket(s):" | awk '{print $2}') if [[ -z "$SOCKETS" ]]; then SOCKETS=1 fi +# Detect NUMA nodes. RandomX fast mode (with XMRig's numa=on) keeps a NUMA-LOCAL copy of the ~2080MB +# dataset PER NODE, so the 1GB-page reservation must scale with NUMA NODES, not sockets: a single-socket +# EPYC can expose 2/4/8 NUMA nodes (NPS / L3-as-NUMA), so counting sockets reserves only one node's worth +# and starves every other node of 1GB backing after a reboot (a large RandomX hashrate hit). Prefer +# lscpu's count, then count sysfs nodes, then fall back to the socket count, then 1. +NODE_SYS="${NODE_SYS:-/sys/devices/system/node}" +NUMA_NODES=$(lscpu 2>/dev/null | awk -F: '/^NUMA node\(s\):/ {gsub(/[^0-9]/, "", $2); print $2; exit}') +if ! { [ -n "$NUMA_NODES" ] && [ "$NUMA_NODES" -gt 0 ]; } 2>/dev/null; then + NUMA_NODES=$(find "$NODE_SYS" -maxdepth 1 -name 'node[0-9]*' 2>/dev/null | wc -l | tr -d ' ') +fi +if ! { [ -n "$NUMA_NODES" ] && [ "$NUMA_NODES" -gt 0 ]; } 2>/dev/null; then + NUMA_NODES="$SOCKETS" +fi + # --- 2. Resource Calculation --- # RandomX Requirement: 2MB L3 Cache per mining thread. Callers can override this estimate with RX_THREADS @@ -56,16 +70,17 @@ else THREADS=$((L3_MB / 2)) fi -# 1GB HugePages: Reserve 3GB per socket for the RandomX dataset (~2080MB) + overhead -TOTAL_GB_PAGES=$((3 * SOCKETS)) +# 1GB HugePages: 3 per NUMA node — each node holds its own ~2080MB RandomX dataset copy (rounds up to 3GB). +TOTAL_GB_PAGES=$((3 * NUMA_NODES)) # 2MB HugePages: Reserve for JIT compiler and scratchpads (128 base + 1 per thread + buffer) TOTAL_2MB_PAGES=$((128 + THREADS + 10)) # Fallback Strategy (Pure 2MB): Covers Dataset (2080MB) + Overhead + JIT -# 1168 pages * 2MB = ~2336MB per socket (Provides ~250MB buffer for fragmentation) +# 1168 pages * 2MB = ~2336MB per NUMA node (Provides ~250MB buffer for fragmentation). Scales per node +# because, like the 1GB path, each NUMA node holds its own dataset copy. BASE_2MB_PAGES=1168 -TOTAL_2MB_FALLBACK=$(((BASE_2MB_PAGES * SOCKETS) + THREADS + 50)) +TOTAL_2MB_FALLBACK=$(((BASE_2MB_PAGES * NUMA_NODES) + THREADS + 50)) if [ "$RUNTIME" -eq 1 ]; then # Check if 1GB pages are already allocated @@ -101,6 +116,7 @@ else echo "--- Hardware Analysis ---" echo "L3 Cache: ${L3_MB} MB" echo "CPU Sockets: $SOCKETS" + echo "NUMA Nodes: $NUMA_NODES (1GB dataset reservation scales with this)" echo "Max Threads: $THREADS (Based on 2MB L3/thread)" echo "-------------------------" echo "Proposed GRUB Configuration:"