Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 75 additions & 1 deletion .github/scripts/prebuild-case-optimization.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,44 @@ case "$cluster" in
*) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
esac

# Optional sharding (format "i/N", e.g. "1/2"), set by submit-slurm-job.sh's
# [shard] argument via $job_shard: shard i builds every Nth case of the sorted
# case list. Unset = build all cases in one job (default; other clusters).

#######################################
# Validate a shard spec and publish its two halves.
# Globals:   shard_idx, shard_count (written)
# Arguments: $1 - shard spec, expected to be exactly "i/N"
# Outputs:   one ERROR line on stdout when the spec is rejected
# Returns:   0 if the spec is valid (1 <= i <= N), 1 otherwise
#######################################
parse_shard() {
  local spec=$1 part

  # "%%/*" keeps the text before the FIRST slash and "##*/" keeps the text
  # after the LAST slash. For "1/2/3" that yields 1 and 3, and for "12" both
  # halves are "12", so digit checks alone do not prove the overall shape.
  shard_idx="${spec%%/*}"
  shard_count="${spec##*/}"

  # Each half must be non-empty, purely numeric and must not start with 0.
  # That rejects "1/", "/2", "//", "a/b" and also the value 0 itself, so no
  # separate lower-bound test is needed afterwards.
  for part in "$shard_idx" "$shard_count"; do
    case "$part" in
      '' | *[!0-9]* | 0*)
        echo "ERROR: bad shard '$spec' (expected i/N)"
        return 1
        ;;
    esac
  done

  # Rebuilding the spec from its halves is what rejects a missing slash
  # ("12") and extra slashes ("1/2/3"); both survive the digit checks above.
  if [ "$spec" != "$shard_idx/$shard_count" ]; then
    echo "ERROR: bad shard '$spec' (expected i/N)"
    return 1
  fi

  if [ "$shard_idx" -gt "$shard_count" ]; then
    echo "ERROR: bad shard '$spec' (expected i/N with 1 <= i <= N)"
    return 1
  fi
}

shard="${job_shard:-}"
if [ -n "$shard" ]; then
  parse_shard "$shard" || exit 1
fi
Comment on lines +28 to +52

# Phoenix starts fresh (no prior dep build); other clusters pre-build deps via
# build.sh first, so we must preserve them and only clean MFC target staging.
# Sharded jobs share one workspace and run concurrently, so the workflow
# cleans once before submitting them — cleaning here would wipe a sibling
# shard's in-progress build. Hence three cases:
#   phoenix            -> full clean via clean_build
#   unsharded, other   -> remove only the hex-named dirs directly under
#                         build/staging and build/install
#   sharded, other     -> clean nothing in this job
if [ "$cluster" = "phoenix" ]; then
  source .github/scripts/clean-build.sh
  clean_build
elif [ -z "$shard" ]; then
  for stage_root in build/staging build/install; do
    # Best effort on purpose: a missing directory is not an error here, so
    # find's diagnostics and exit status are discarded.
    find "$stage_root" -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
  done
fi
Expand All @@ -40,7 +72,49 @@ case "$job_interface" in
*) echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;;
esac

# Case-optimized simulation builds land in per-case hash-named staging dirs,
# but syscheck/pre_process/post_process hash identically across these cases.
# Concurrent shards must not build those shared staging dirs simultaneously:
# shard 1 builds them first and drops a done marker; other shards wait for it,
# after which their builds no-op in the shared dirs.

#######################################
# Block until the shared-target build is reported finished.
# Arguments: $1 - path of the "done" marker file
#            $2 - path of the "failed" marker file
#            $3 - give up after this many seconds (default 5400 = 90 minutes)
#            $4 - seconds between polls (default 30)
# Outputs:   one ERROR line on stdout on failure or timeout
# Returns:   0 once the done marker exists, 1 on failure marker or timeout
#######################################
wait_for_shared_targets() {
  local done_marker=$1 failed_marker=$2
  local timeout_s=${3:-5400} poll_s=${4:-30}
  local waited=0

  until [ -f "$done_marker" ]; do
    if [ -f "$failed_marker" ]; then
      echo "ERROR: shard 1 failed to build shared targets; see shard 1 log"
      return 1
    fi
    if [ "$waited" -ge "$timeout_s" ]; then
      echo "ERROR: timed out waiting for $done_marker"
      return 1
    fi
    sleep "$poll_s"
    waited=$((waited + poll_s))
  done
}

if [ -n "$shard" ] && [ "$shard_count" -gt 1 ]; then
  shared_marker_done="build/.prebuild-shared-targets-done"
  shared_marker_failed="build/.prebuild-shared-targets-failed"
  if [ "$shard_idx" -eq 1 ]; then
    # First entry of the glob; taken with a loop so the script's positional
    # parameters are left untouched.
    for first_case in benchmarks/*/case.py; do
      break
    done
    # Remove both markers at the start so reruns and manual invocations
    # never observe stale state from a prior run.
    rm -f "$shared_marker_done" "$shared_marker_failed"
    echo "=== Shard 1/$shard_count: building shared targets ==="
    # Check the build status explicitly rather than through an ERR trap:
    # this works with or without `set -e`, leaves any existing ERR trap
    # alone, and guarantees the done marker is never written after a failed
    # build. $gpu_opts stays unquoted as elsewhere in this script,
    # presumably so that several flags word-split — confirm before quoting.
    build_rc=0
    ./mfc.sh build -i "$first_case" -t syscheck pre_process post_process --case-optimization $gpu_opts -j 8 || build_rc=$?
    if [ "$build_rc" -ne 0 ]; then
      # Lets the other shards fail fast instead of waiting for the timeout.
      touch "$shared_marker_failed"
      echo "ERROR: shard 1 failed to build shared targets (exit $build_rc)"
      exit "$build_rc"
    fi
    touch "$shared_marker_done"
  else
    echo "=== Shard $shard_idx/$shard_count: waiting for shard 1 to build shared targets ==="
    wait_for_shared_targets "$shared_marker_done" "$shared_marker_failed" || exit 1
  fi
fi
Comment on lines +80 to +110

# Walk the benchmark cases in glob order, numbering them from 1. Without a
# shard every case is pre-built; with shard i/N only the cases whose zero-based
# position is congruent to i-1 modulo N are pre-built.
idx=0
for case in benchmarks/*/case.py; do
  idx=$((idx + 1))
  # The shard test is only evaluated when a shard is set, so shard_count and
  # shard_idx are never read in an unsharded run.
  if [ -z "$shard" ] || (( (idx - 1) % shard_count == shard_idx - 1 )); then
    echo "=== Pre-building: $case ==="
    ./mfc.sh run "$case" --case-optimization $gpu_opts -j 8 --dry-run
  fi
done
30 changes: 27 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,13 @@ jobs:
cluster_name: 'Oak Ridge | Frontier (AMD)'
device: 'cpu'
interface: 'none'
shard: '1/2'
- runner: 'frontier'
cluster: 'frontier_amd'
cluster_name: 'Oak Ridge | Frontier (AMD)'
device: 'cpu'
interface: 'none'
shard: '2/2'
runs-on:
group: phoenix
labels: ${{ matrix.runner }}
Expand All @@ -420,7 +427,7 @@ jobs:

- name: Fetch Dependencies
  if: matrix.cluster != 'phoenix'
  # A step may define timeout-minutes only once; 120 is the limit in effect.
  timeout-minutes: 120
  run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}

- name: Build
Expand Down Expand Up @@ -523,7 +530,22 @@ jobs:

- name: Pre-Build (SLURM)
  if: matrix.cluster == 'frontier_amd'
  # AMD flang is slow enough that one serial pre-build job exceeds its
  # walltime, so split the case list across two concurrent SLURM jobs.
  # The shards share this workspace and skip their in-job staging clean,
  # so clean once here on the login node before submitting.
  run: |
    find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
    find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
    # Clear BOTH coordination markers before any shard is submitted. Shard 2
    # may start polling before shard 1 has removed them, and a failed marker
    # left over from an earlier run would make it abort immediately.
    rm -f build/.prebuild-shared-targets-done build/.prebuild-shared-targets-failed
    bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }} 1/2 &
    pid1=$!
    bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }} 2/2 &
    pid2=$!
    # Wait for both submissions so each shard's log is complete, then fail
    # the step if either one failed.
    rc=0
    wait "$pid1" || rc=1
    wait "$pid2" || rc=1
    exit $rc

- name: Build & Run Case-Optimization Tests
if: matrix.cluster != 'phoenix' && matrix.cluster != 'frontier_amd'
Expand All @@ -546,6 +568,8 @@ jobs:
if: always()
run: |
for f in prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out \
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}-1-of-2.out \
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}-2-of-2.out \
run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out; do
[ -f "$f" ] && echo "=== $f ===" && cat "$f"
done
Expand All @@ -556,5 +580,5 @@ jobs:
with:
name: case-opt-${{ strategy.job-index }}-${{ matrix.cluster }}-${{ matrix.interface }}
path: |
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}*.out
run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out
Loading