From 9163a11e2cafae71f4be92ce3547bc8440c523f4 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@login10.frontier.olcf.ornl.gov>
Date: Fri, 6 Mar 2026 02:08:43 -0500
Subject: [PATCH 01/40] Fix Frontier benchmark SLURM: use batch+1:59+normal QOS

Benchmark jobs were using the extended partition (5:59 walltime, ENG160
account), causing multi-hour queue waits and hitting GHA's 8h wall-clock
limit. The actual benchmark runs in ~20 minutes on the node. Switch to
the CFD154 account, batch partition, 1:59 walltime, and --qos=normal
(same as the test suite jobs).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml          | 2 +-
 .github/workflows/frontier/submit.sh | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index b45fc45e40..5cf9681e33 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -88,7 +88,7 @@ jobs:
     runs-on:
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
-    timeout-minutes: 480
+    timeout-minutes: 240
     steps:
       - name: Clone - PR
         uses: actions/checkout@v4
diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh
index 16d4f0d73c..8b914db03e 100644
--- a/.github/workflows/frontier/submit.sh
+++ b/.github/workflows/frontier/submit.sh
@@ -45,10 +45,10 @@ fi
 
 # Select SBATCH params based on job type
 if [ "$job_type" = "bench" ]; then
-    sbatch_account="#SBATCH -A ENG160"
-    sbatch_time="#SBATCH -t 05:59:00"
-    sbatch_partition="#SBATCH -p extended"
-    sbatch_extra=""
+    sbatch_account="#SBATCH -A CFD154"
+    sbatch_time="#SBATCH -t 01:59:00"
+    sbatch_partition="#SBATCH -p batch"
+    sbatch_extra="#SBATCH --qos=normal"
 else
     sbatch_account="#SBATCH -A CFD154"
     sbatch_time="#SBATCH -t 01:59:00"

From ffe80ec2e01c5637955c0a21eb8c986ad7e2077c Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@login10.frontier.olcf.ornl.gov>
Date: Fri, 6 Mar 2026 02:13:47 -0500
Subject: [PATCH 02/40] Fix bench.yml: restore timeout-minutes to 480 (revert
 accidental 240)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 5cf9681e33..b45fc45e40 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -88,7 +88,7 @@ jobs:
     runs-on:
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
-    timeout-minutes: 240
+    timeout-minutes: 480
     steps:
       - name: Clone - PR
         uses: actions/checkout@v4

From cfbc02303fec44b63a51ed6a03f4853c8ce8be8b Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 6 Mar 2026 11:39:45 -0500
Subject: [PATCH 03/40] Remove persistent build cache for self-hosted test
 runners

Replace setup-build-cache.sh symlink mechanism with rm -rf build
before each test run on Phoenix and Frontier. Benchmark jobs unaffected.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/frontier/build.sh | 3 +--
 .github/workflows/phoenix/test.sh   | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index 88446ad2a0..6abb0cff8a 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -20,9 +20,8 @@ build_opts="$gpu_opts"
 
 . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
 
-# Only set up build cache for test suite, not benchmarks
 if [ "$run_bench" != "bench" ]; then
-    source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
+    rm -rf build
 fi
 
 source .github/scripts/retry-build.sh
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 6816bd9a25..c8a5af2132 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -3,8 +3,7 @@
 source .github/scripts/gpu-opts.sh
 build_opts="$gpu_opts"
 
-# Set up persistent build cache
-source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
+rm -rf build
 
 # Build with retry; smoke-test cached binaries to catch architecture mismatches
 # (SIGILL from binaries compiled on a different compute node).

From 574203046c0f324127979718ae1c4932c67c22fc Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 6 Mar 2026 11:50:47 -0500
Subject: [PATCH 04/40] Remove build cache from benchmark jobs on Phoenix and
 Frontier

---
 .github/workflows/frontier/build.sh | 4 +---
 .github/workflows/phoenix/bench.sh  | 2 ++
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index 6abb0cff8a..d21b1ddac4 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -20,9 +20,7 @@ build_opts="$gpu_opts"
 
 . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
 
-if [ "$run_bench" != "bench" ]; then
-    rm -rf build
-fi
+rm -rf build
 
 source .github/scripts/retry-build.sh
 if [ "$run_bench" == "bench" ]; then
diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh
index 0eafc485d1..e91ece366b 100644
--- a/.github/workflows/phoenix/bench.sh
+++ b/.github/workflows/phoenix/bench.sh
@@ -15,6 +15,8 @@ else
     bench_opts="--mem 1"
 fi
 
+rm -rf build
+
 source .github/scripts/retry-build.sh
 RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1
 

From 7edb7c389e5fff6483ea45b5af7324b378ed60fa Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 6 Mar 2026 14:10:18 -0500
Subject: [PATCH 05/40] Fix submit.sh to survive monitor SIGKILL by re-checking
 SLURM state

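When the monitor exits non-zero (e.g. exit 137 because the runner
process was killed) before the SLURM job completes, phoenix/submit.sh
waits 30 seconds and then uses sacct to verify the job's final state.
If the SLURM job is COMPLETED with exit code 0:0, the CI step passes
regardless of the monitor's exit code; otherwise the step fails with
exit 1.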

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/phoenix/submit.sh | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 5b7162fef7..c370ec5a3f 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -96,4 +96,20 @@ echo "Submitted batch job $job_id"
 
 # Use resilient monitoring instead of sbatch -W
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
+monitor_exit=0
+bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
+
+if [ "$monitor_exit" -ne 0 ]; then
+  echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
+  # Give the SLURM epilog time to finalize if the job just finished
+  sleep 30
+  final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN")
+  final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "")
+  echo "Final SLURM state=$final_state exit=$final_exit"
+  if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
+    echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
+  else
+    echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)"
+    exit 1
+  fi
+fi

From 773f5adfcec5d9dc68b1fcff91d5eb0c492d6cfa Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 6 Mar 2026 14:28:40 -0500
Subject: [PATCH 06/40] Extract monitor SIGKILL recovery into shared
 run_monitored_slurm_job.sh

All three submit.sh scripts (phoenix, frontier, frontier_amd symlink) now
call a single helper that wraps monitor_slurm_job.sh with sacct fallback:
if the monitor is killed before the SLURM job completes, the helper
re-checks the job's final state and exits 0 if it succeeded.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/run_monitored_slurm_job.sh | 37 ++++++++++++++++++++++
 .github/workflows/frontier/submit.sh       |  3 +-
 .github/workflows/phoenix/submit.sh        | 19 +----------
 3 files changed, 39 insertions(+), 20 deletions(-)
 create mode 100644 .github/scripts/run_monitored_slurm_job.sh

diff --git a/.github/scripts/run_monitored_slurm_job.sh b/.github/scripts/run_monitored_slurm_job.sh
new file mode 100644
index 0000000000..905520c45e
--- /dev/null
+++ b/.github/scripts/run_monitored_slurm_job.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Run monitor_slurm_job.sh and recover if the monitor is killed (e.g. SIGKILL
+# from the runner OS) before the SLURM job completes.  When the monitor exits
+# non-zero, sacct is used to verify the job's actual final state; if the SLURM
+# job succeeded we exit 0 so the CI step is not falsely marked as failed.
+#
+# Usage: run_monitored_slurm_job.sh <job_id> <output_file>
+
+set -euo pipefail
+
+if [ $# -ne 2 ]; then
+    echo "Usage: $0 <job_id> <output_file>"
+    exit 1
+fi
+
+job_id="$1"
+output_file="$2"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+monitor_exit=0
+bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
+
+if [ "$monitor_exit" -ne 0 ]; then
+    echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
+    # Give the SLURM epilog time to finalize if the job just finished
+    sleep 30
+    final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN")
+    final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "")
+    echo "Final SLURM state=$final_state exit=$final_exit"
+    if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
+        echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
+    else
+        echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)"
+        exit 1
+    fi
+fi
diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh
index 8b914db03e..4b472cd433 100644
--- a/.github/workflows/frontier/submit.sh
+++ b/.github/workflows/frontier/submit.sh
@@ -102,5 +102,4 @@ fi
 
 echo "Submitted batch job $job_id"
 
-# Use resilient monitoring instead of sbatch -W
-bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
+bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index c370ec5a3f..786489d1c4 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -94,22 +94,5 @@ fi
 
 echo "Submitted batch job $job_id"
 
-# Use resilient monitoring instead of sbatch -W
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-monitor_exit=0
-bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
-
-if [ "$monitor_exit" -ne 0 ]; then
-  echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
-  # Give the SLURM epilog time to finalize if the job just finished
-  sleep 30
-  final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN")
-  final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "")
-  echo "Final SLURM state=$final_state exit=$final_exit"
-  if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
-    echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
-  else
-    echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)"
-    exit 1
-  fi
-fi
+bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"

From 1311cbe4544ad75818f29e64ecec073248a20080 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 5 Mar 2026 05:03:58 -0500
Subject: [PATCH 07/40] Reduce benchmark steps and switch Frontier bench to
 batch/normal QOS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Cut benchmark time steps from 60-70 to 20 (GPU) / 10 (CPU) — still
  sufficient for grind time measurement
- Unify Frontier SLURM config: bench now uses CFD154/batch/normal like
  tests instead of ENG160/extended (2hr wall time vs 6hr)
- Reduce CI timeout from 8hr to 4hr

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml                   |  2 +-
 .github/workflows/frontier/submit.sh          | 15 ++++-----------
 benchmarks/5eq_rk3_weno3_hllc/case.py         |  4 ++--
 benchmarks/hypo_hll/case.py                   |  4 ++--
 benchmarks/ibm/case.py                        |  4 ++--
 benchmarks/igr/case.py                        |  4 ++--
 benchmarks/viscous_weno5_sgb_acoustic/case.py |  4 ++--
 7 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index b45fc45e40..5cf9681e33 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -88,7 +88,7 @@ jobs:
     runs-on:
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
-    timeout-minutes: 480
+    timeout-minutes: 240
     steps:
       - name: Clone - PR
         uses: actions/checkout@v4
diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh
index 4b472cd433..c5dc8a41d3 100644
--- a/.github/workflows/frontier/submit.sh
+++ b/.github/workflows/frontier/submit.sh
@@ -44,17 +44,10 @@ else
 fi
 
 # Select SBATCH params based on job type
-if [ "$job_type" = "bench" ]; then
-    sbatch_account="#SBATCH -A CFD154"
-    sbatch_time="#SBATCH -t 01:59:00"
-    sbatch_partition="#SBATCH -p batch"
-    sbatch_extra="#SBATCH --qos=normal"
-else
-    sbatch_account="#SBATCH -A CFD154"
-    sbatch_time="#SBATCH -t 01:59:00"
-    sbatch_partition="#SBATCH -p batch"
-    sbatch_extra="#SBATCH --qos=normal"
-fi
+sbatch_account="#SBATCH -A CFD154"
+sbatch_time="#SBATCH -t 01:59:00"
+sbatch_partition="#SBATCH -p batch"
+sbatch_extra="#SBATCH --qos=normal"
 
 shard_suffix=""
 if [ -n "$4" ]; then
diff --git a/benchmarks/5eq_rk3_weno3_hllc/case.py b/benchmarks/5eq_rk3_weno3_hllc/case.py
index 5ecc327e8f..fa09426ffe 100644
--- a/benchmarks/5eq_rk3_weno3_hllc/case.py
+++ b/benchmarks/5eq_rk3_weno3_hllc/case.py
@@ -191,8 +191,8 @@
             "cyl_coord": "F",
             "dt": dt,
             "t_step_start": 0,
-            "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)),
-            "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)),
+            "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)),
+            "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)),
             # Simulation Algorithm Parameters
             "num_patches": 3,
             "model_eqns": 2,
diff --git a/benchmarks/hypo_hll/case.py b/benchmarks/hypo_hll/case.py
index 1663a507aa..f8d0928a01 100644
--- a/benchmarks/hypo_hll/case.py
+++ b/benchmarks/hypo_hll/case.py
@@ -44,8 +44,8 @@
             "p": Nz,
             "dt": 1e-8,
             "t_step_start": 0,
-            "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)),
-            "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)),
+            "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)),
+            "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)),
             # Simulation Algorithm Parameters
             "num_patches": 2,
             "model_eqns": 2,
diff --git a/benchmarks/ibm/case.py b/benchmarks/ibm/case.py
index e16cb620b7..303cf7fcaf 100644
--- a/benchmarks/ibm/case.py
+++ b/benchmarks/ibm/case.py
@@ -48,8 +48,8 @@
             "p": Nz,
             "dt": mydt,
             "t_step_start": 0,
-            "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)),
-            "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)),
+            "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)),
+            "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)),
             # Simulation Algorithm Parameters
             "num_patches": 1,
             "model_eqns": 2,
diff --git a/benchmarks/igr/case.py b/benchmarks/igr/case.py
index 469bff1fa9..4ceed76257 100644
--- a/benchmarks/igr/case.py
+++ b/benchmarks/igr/case.py
@@ -63,8 +63,8 @@
             "cyl_coord": "F",
             "dt": dt,
             "t_step_start": 0,
-            "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)),
-            "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)),
+            "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)),
+            "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)),
             # Simulation Algorithm Parameters
             "num_patches": 1,
             "model_eqns": 2,
diff --git a/benchmarks/viscous_weno5_sgb_acoustic/case.py b/benchmarks/viscous_weno5_sgb_acoustic/case.py
index 9f1351b0c1..83bdc43e9c 100644
--- a/benchmarks/viscous_weno5_sgb_acoustic/case.py
+++ b/benchmarks/viscous_weno5_sgb_acoustic/case.py
@@ -94,8 +94,8 @@
             "p": Nz,
             "dt": dt,
             "t_step_start": 0,
-            "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(6 * (5 * size + 5)),
-            "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(6 * (5 * size + 5)),
+            "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)),
+            "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)),
             # Simulation Algorithm Parameters
             "num_patches": 2,
             "model_eqns": 2,

From 644c9e4d27037011518fac5c22cd1d0794ed5c1c Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 3 Mar 2026 17:04:20 -0500
Subject: [PATCH 08/40] Cap bench script parallelism at 64 to fix GNR node
 failures

On GNR nodes (192 cores), $(nproc) returns 192, which overwhelms
MPI daemons and causes SIGTERM (exit 143) during benchmarks.
When master lands on a 24-core node and passes while the PR lands on
a GNR node and fails, the benchmarks appear to be broken by the PR.
Cap -j at min($(nproc), 64) for the bench runs and the Phoenix build.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/frontier/bench.sh | 5 ++++-
 .github/workflows/phoenix/bench.sh  | 8 ++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh
index b60f8541a2..b896feb17c 100644
--- a/.github/workflows/frontier/bench.sh
+++ b/.github/workflows/frontier/bench.sh
@@ -2,8 +2,11 @@
 
 source .github/scripts/bench-preamble.sh
 
+# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes.
+n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) ))
+
 if [ "$job_device" = "gpu" ]; then
     ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
 else
-    ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
+    ./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
 fi
diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh
index e91ece366b..9a661cb924 100644
--- a/.github/workflows/phoenix/bench.sh
+++ b/.github/workflows/phoenix/bench.sh
@@ -2,6 +2,10 @@
 
 source .github/scripts/bench-preamble.sh
 
+# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes
+# (GNR nodes have 192 cores but nproc is too aggressive for build/bench).
+n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) ))
+
 tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
 currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
 mkdir -p $tmpbuild
@@ -18,9 +22,9 @@ fi
 rm -rf build
 
 source .github/scripts/retry-build.sh
-RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1
+RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
 
-./mfc.sh bench $bench_opts -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
+./mfc.sh bench $bench_opts -j $n_jobs -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
 
 sleep 10
 rm -rf "$currentdir" || true

From a02f4b20497a47f4504f051ee28d8a084bb19564 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 2 Mar 2026 20:36:56 -0500
Subject: [PATCH 09/40] Disable AVX-512 FP16 to fix build on Granite Rapids
 nodes

gfortran 12+ with -march=native on Granite Rapids (GNR) CPUs emits
vmovw instructions (AVX-512 FP16) that binutils older than 2.38 (e.g.
2.35) cannot assemble, causing LTO link failures. Add -mno-avx512fp16
when the compiler supports it. FP16 is unused in MFC's double-precision
computations. Also skip the native CPU tuning flags entirely for gcov
builds (MFC_GCov).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 CMakeLists.txt | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ddb3876724..3c5a80638f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -224,13 +224,24 @@ endif()
 
 if (CMAKE_BUILD_TYPE STREQUAL "Release")
     # Processor tuning: Check if we can target the host's native CPU's ISA.
-    CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE)
-    if (SUPPORTS_MARCH_NATIVE)
-        add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>)
-    else()
-    	CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE)
-        if (SUPPORTS_MCPU_NATIVE)
-            add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>)
+    # Skip for gcov builds — -march=native on newer CPUs (e.g. Granite Rapids)
+    # can emit instructions the system assembler doesn't support.
+    if (NOT MFC_GCov)
+        CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE)
+        if (SUPPORTS_MARCH_NATIVE)
+            add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>)
+            # Disable AVX-512 FP16: gfortran ≥12 emits vmovw instructions on
+            # Granite Rapids CPUs, but binutils <2.38 cannot assemble them.
+            # FP16 is unused in MFC's double-precision computations.
+            CHECK_FORTRAN_COMPILER_FLAG("-mno-avx512fp16" SUPPORTS_MNO_AVX512FP16)
+            if (SUPPORTS_MNO_AVX512FP16)
+                add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mno-avx512fp16>)
+            endif()
+        else()
+            CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE)
+            if (SUPPORTS_MCPU_NATIVE)
+                add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>)
+            endif()
         endif()
     endif()
 

From ba91673f05785a1145f55d82af9758919b60fe23 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 2 Mar 2026 18:09:35 -0500
Subject: [PATCH 10/40] Fix Rich MarkupError crash when build output contains
 bracket paths

Build errors containing [/tmp/...] paths (e.g. LTO linker output) were
misinterpreted as Rich markup closing tags, crashing the error display
and masking the actual build failure. Wrap raw output in Text() to
prevent markup interpretation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolchain/mfc/build.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index 6430f7ad35..08ff6d7510 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -1,6 +1,7 @@
 import os, typing, hashlib, dataclasses, subprocess, re, time, sys, threading, queue
 
 from rich.panel import Panel
+from rich.text  import Text
 from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn, TaskProgressColumn
 
 from .case    import Case
@@ -273,14 +274,14 @@ def _show_build_error(result: subprocess.CompletedProcess, stage: str):
         stdout_text = result.stdout if isinstance(result.stdout, str) else result.stdout.decode('utf-8', errors='replace')
         stdout_text = stdout_text.strip()
         if stdout_text:
-            cons.raw.print(Panel(stdout_text, title="Output", border_style="yellow"))
+            cons.raw.print(Panel(Text(stdout_text), title="Output", border_style="yellow"))
 
     # Show stderr if available
     if result.stderr:
         stderr_text = result.stderr if isinstance(result.stderr, str) else result.stderr.decode('utf-8', errors='replace')
         stderr_text = stderr_text.strip()
         if stderr_text:
-            cons.raw.print(Panel(stderr_text, title="Errors", border_style="red"))
+            cons.raw.print(Panel(Text(stderr_text), title="Errors", border_style="red"))
 
     cons.print()
 

From 3e773fffd895174160cb7e02b272e93028f17740 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@login10.frontier.olcf.ornl.gov>
Date: Fri, 6 Mar 2026 15:07:43 -0500
Subject: [PATCH 11/40] Address bot review comments: sacct -X flag, dead
 job_type var, stale comment

---
 .github/scripts/run_monitored_slurm_job.sh | 2 +-
 .github/workflows/frontier/submit.sh       | 7 -------
 .github/workflows/phoenix/test.sh          | 4 ++--
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/.github/scripts/run_monitored_slurm_job.sh b/.github/scripts/run_monitored_slurm_job.sh
index 905520c45e..22141043ad 100644
--- a/.github/scripts/run_monitored_slurm_job.sh
+++ b/.github/scripts/run_monitored_slurm_job.sh
@@ -26,7 +26,7 @@ if [ "$monitor_exit" -ne 0 ]; then
     # Give the SLURM epilog time to finalize if the job just finished
     sleep 30
     final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN")
-    final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "")
+    final_exit=$(sacct -j "$job_id" -X --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "")
     echo "Final SLURM state=$final_state exit=$final_exit"
     if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
         echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh
index c5dc8a41d3..070a03094b 100644
--- a/.github/workflows/frontier/submit.sh
+++ b/.github/workflows/frontier/submit.sh
@@ -25,13 +25,6 @@ else
     exit 1
 fi
 
-# Detect job type from submitted script basename
-script_basename="$(basename "$1" .sh)"
-case "$script_basename" in
-    bench*) job_type="bench" ;;
-    *)      job_type="test"  ;;
-esac
-
 if [ "$2" = "cpu" ]; then
     sbatch_device_opts="\
 #SBATCH -n 32                       # Number of cores required"
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index c8a5af2132..3e8c9caa66 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -5,8 +5,8 @@ build_opts="$gpu_opts"
 
 rm -rf build
 
-# Build with retry; smoke-test cached binaries to catch architecture mismatches
-# (SIGILL from binaries compiled on a different compute node).
+# Build with retry; smoke-test the freshly built syscheck binary to catch
+# architecture mismatches (SIGILL from binaries compiled on a different compute node).
 source .github/scripts/retry-build.sh
 RETRY_VALIDATE_CMD='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' \
     retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1

From fae2e6a08a2971d5f91e50e0063fca08a8f70b70 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 6 Mar 2026 15:14:25 -0500
Subject: [PATCH 12/40] Fix bench: use PR's submit.sh for master job to get
 SIGKILL recovery

When benchmarking master vs PR, submit_and_monitor_bench.sh was using the
master directory's submit.sh for the master bench job. Master's submit.sh
calls monitor_slurm_job.sh directly without SIGKILL recovery. When the
monitor was killed (exit 137), the master bench YAML was never found.

Fix: always use the PR's submit.sh (which calls run_monitored_slurm_job.sh
with sacct fallback) for both master and PR bench submissions.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/submit_and_monitor_bench.sh | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh
index c081c8692a..9eae6b9ff7 100755
--- a/.github/scripts/submit_and_monitor_bench.sh
+++ b/.github/scripts/submit_and_monitor_bench.sh
@@ -17,9 +17,13 @@ cluster="$4"
 echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
 cd "$dir"
 
-# Submit and monitor job (submit.sh auto-detects bench mode from script name)
-bash .github/workflows/$cluster/submit.sh \
-    .github/workflows/$cluster/bench.sh "$device" "$interface"
+# Always use the PR's submit.sh so both master and PR builds benefit from the
+# run_monitored_slurm_job.sh SIGKILL recovery wrapper.  The bench script is
+# still resolved relative to the current directory (master/ or pr/) so the
+# correct branch code is benchmarked.  SLURM_SUBMIT_DIR ensures the job runs
+# in the right directory regardless of which submit.sh is invoked.
+PR_SUBMIT="${SCRIPT_DIR}/../workflows/${cluster}/submit.sh"
+bash "$PR_SUBMIT" .github/workflows/$cluster/bench.sh "$device" "$interface"
 
 # Verify the YAML output file was created
 job_slug="bench-$device-$interface"

From 3224931537e141cee2c0c977e49bfa2307d6d4ab Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 6 Mar 2026 15:21:10 -0500
Subject: [PATCH 13/40] Fix submit_and_monitor_bench.sh: define SCRIPT_DIR
 before use

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/submit_and_monitor_bench.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh
index 9eae6b9ff7..e0a6eb7384 100755
--- a/.github/scripts/submit_and_monitor_bench.sh
+++ b/.github/scripts/submit_and_monitor_bench.sh
@@ -14,6 +14,8 @@ device="$2"
 interface="$3"
 cluster="$4"
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
 echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
 cd "$dir"
 

From 2887def4d0c2fef2a1d202493120247063bc2e18 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 03:59:20 -0500
Subject: [PATCH 14/40] bench: update Phoenix tmpbuild path to project storage

---
 .github/workflows/phoenix/bench.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh
index 9a661cb924..218cf68a5f 100644
--- a/.github/workflows/phoenix/bench.sh
+++ b/.github/workflows/phoenix/bench.sh
@@ -6,7 +6,7 @@ source .github/scripts/bench-preamble.sh
 # (GNR nodes have 192 cores but nproc is too aggressive for build/bench).
 n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) ))
 
-tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
+tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
 currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
 mkdir -p $tmpbuild
 mkdir -p $currentdir

From 1e4f984238247c03d406a4f77dd5bd85facfac6e Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 12:57:06 -0500
Subject: [PATCH 15/40] =?UTF-8?q?Fix=20bench=20timeout=20(240=E2=86=92480)?=
 =?UTF-8?q?=20and=20monitor=20scancel=20defeating=20sacct=20recovery?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Restore bench.yml timeout-minutes to 480 (accidentally regressed to 240)
- Fix monitor_slurm_job.sh cleanup trap: check squeue before calling scancel
  so jobs that already left the queue are not cancelled, allowing
  run_monitored_slurm_job.sh to recover successfully via sacct

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/monitor_slurm_job.sh | 14 ++++++++++----
 .github/workflows/bench.yml          |  2 +-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh
index ba7587ec70..0567a2ddb1 100755
--- a/.github/scripts/monitor_slurm_job.sh
+++ b/.github/scripts/monitor_slurm_job.sh
@@ -9,11 +9,17 @@ cleanup() {
   if [ -n "${tail_pid:-}" ]; then
     kill "${tail_pid}" 2>/dev/null || true
   fi
-  # Cancel the SLURM job if the monitor is exiting due to an error
-  # (e.g., the CI runner is being killed). Don't cancel on success.
+  # Cancel the SLURM job only if it is still active in the scheduler.
+  # If the job already left the queue (squeue returns empty), it has finished
+  # and run_monitored_slurm_job.sh will recover via sacct — don't cancel it.
   if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
-    echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
-    scancel "$job_id" 2>/dev/null || true
+    active_state=$(squeue -j "$job_id" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || echo "")
+    if [ -n "$active_state" ]; then
+      echo "Monitor exiting abnormally — cancelling SLURM job $job_id (state: $active_state)"
+      scancel "$job_id" 2>/dev/null || true
+    else
+      echo "Monitor exiting abnormally — SLURM job $job_id already left queue, not cancelling"
+    fi
   fi
 }
 trap cleanup EXIT
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 5cf9681e33..b45fc45e40 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -88,7 +88,7 @@ jobs:
     runs-on:
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
-    timeout-minutes: 240
+    timeout-minutes: 480
     steps:
       - name: Clone - PR
         uses: actions/checkout@v4

From 5886f2ae3792ae4f6daf4e06273c3e9f14a933d6 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 13:05:51 -0500
Subject: [PATCH 16/40] Fix sacct empty-output edge case in
 run_monitored_slurm_job.sh
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sacct can return empty output (zero exit) when accounting is not yet
recorded or the epilog hasn't finished — the previous '|| echo UNKNOWN'
only caught non-zero exits, leaving final_state=''. Use '|| true' to
suppress exit-on-error and ${var:-default} expansion to default to
UNKNOWN when the output is empty.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/run_monitored_slurm_job.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/run_monitored_slurm_job.sh b/.github/scripts/run_monitored_slurm_job.sh
index 22141043ad..6fb9e254ec 100644
--- a/.github/scripts/run_monitored_slurm_job.sh
+++ b/.github/scripts/run_monitored_slurm_job.sh
@@ -25,8 +25,10 @@ if [ "$monitor_exit" -ne 0 ]; then
     echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
     # Give the SLURM epilog time to finalize if the job just finished
     sleep 30
-    final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN")
-    final_exit=$(sacct -j "$job_id" -X --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "")
+    final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true)
+    final_state="${final_state:-UNKNOWN}"
+    final_exit=$(sacct -j "$job_id" -X --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || true)
+    final_exit="${final_exit:-}"
     echo "Final SLURM state=$final_state exit=$final_exit"
     if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
         echo "SLURM job $job_id completed successfully despite monitor failure — continuing."

From 0551deabc34834aa9516d989549f016debd9b473 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 20:09:47 -0500
Subject: [PATCH 17/40] bench: dynamic Phoenix GPU partition, per-case logs,
 downgrade grind threshold to warning

- run_parallel_benchmarks.sh: before parallel launch, query sinfo for
  available GPU partitions (H200->H100->A100->L40S->RTX6000->V100)
  and export BENCH_GPU_PARTITION so both PR and master submit to the
  same GPU type; skip Blackwell (not on embers)
- phoenix/submit.sh: replace hardcoded -CL40S constraint with
  -p ${BENCH_GPU_PARTITION:-gpu-l40s} for bench GPU jobs
- bench.yml: add per-case failure and success log steps using .yaml
  presence to distinguish pass/fail per benchmark case
- bench.py: downgrade grind time regression check from error to warning

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/run_parallel_benchmarks.sh | 17 +++++++++++++++
 .github/workflows/bench.yml                | 24 ++++++++++++++++++++++
 .github/workflows/phoenix/submit.sh        |  3 ++-
 toolchain/mfc/bench.py                     |  3 +--
 4 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh
index be9b5c5a94..6d869059e9 100755
--- a/.github/scripts/run_parallel_benchmarks.sh
+++ b/.github/scripts/run_parallel_benchmarks.sh
@@ -20,6 +20,23 @@ echo "=========================================="
 echo "Starting parallel benchmark jobs..."
 echo "=========================================="
 
+# For Phoenix GPU benchmarks, select a consistent GPU partition before launching
+# both parallel jobs so PR and master always land on the same GPU type.
+if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
+    echo "Selecting Phoenix GPU partition for benchmark consistency..."
+    BENCH_GPU_PARTITION=""
+    for part in gpu-h200 gpu-h100 gpu-a100 gpu-l40s gpu-rtx6000 gpu-v100; do
+        idle=$(sinfo -p "$part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
+        if [ "${idle:-0}" -gt 0 ]; then
+            BENCH_GPU_PARTITION="$part"
+            break
+        fi
+    done
+    BENCH_GPU_PARTITION="${BENCH_GPU_PARTITION:-gpu-l40s}"
+    export BENCH_GPU_PARTITION
+    echo "Selected GPU partition: $BENCH_GPU_PARTITION"
+fi
+
 # Run both jobs with monitoring using dedicated script from PR
 # Use stdbuf for line-buffered output and prefix each line for clarity
 (set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) &
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index b45fc45e40..18febfaacb 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -137,6 +137,30 @@ jobs:
           cat pr/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true
           cat master/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true
 
+      - name: Print Per-Case Failure Logs
+        if: always()
+        run: |
+          for out in pr/build/benchmarks/*/*.out master/build/benchmarks/*/*.out; do
+            [ -f "$out" ] || continue
+            yaml="${out%.out}.yaml"
+            if [ ! -f "$yaml" ]; then
+              echo "=== [FAILED] $out ==="
+              cat "$out"
+            fi
+          done
+
+      - name: Print Per-Case Success Logs
+        if: always()
+        run: |
+          for out in pr/build/benchmarks/*/*.out master/build/benchmarks/*/*.out; do
+            [ -f "$out" ] || continue
+            yaml="${out%.out}.yaml"
+            if [ -f "$yaml" ]; then
+              echo "=== [PASSED] $out ==="
+              cat "$out"
+            fi
+          done
+
       # All other runners (non-Phoenix) just run without special env
       - name: Archive Logs (Frontier)
         if: always() && matrix.cluster != 'phoenix'
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 786489d1c4..71623ed494 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -30,8 +30,9 @@ sbatch_cpu_opts="\
 "
 
 if [ "$job_type" = "bench" ]; then
+    bench_partition="${BENCH_GPU_PARTITION:-gpu-l40s}"
     sbatch_gpu_opts="\
-#SBATCH -CL40S
+#SBATCH -p $bench_partition
 #SBATCH --ntasks-per-node=4       # Number of cores per node required
 #SBATCH -G2\
 "
diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py
index 74f7469482..58b90e965b 100644
--- a/toolchain/mfc/bench.py
+++ b/toolchain/mfc/bench.py
@@ -228,8 +228,7 @@ def diff():
                     grind_time_value = lhs_summary[target.name]["grind"] / rhs_summary[target.name]["grind"]
                     speedups[i] += f" & Grind: {grind_time_value:.2f}"
                     if grind_time_value < 0.95:
-                        cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}")
-                        err = 1
+                        cons.print(f"[bold yellow]Warning[/bold yellow]: Grind time speedup for {target.name} below threshold (<0.95) - Case: {slug}")
             except Exception as e:
                 cons.print(
                     f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}\n"

From 16e0f7684e7c637e830b04b325b354c5370ba351 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 20:23:14 -0500
Subject: [PATCH 18/40] bench: address code review findings in GPU partition
 selection

- run_parallel_benchmarks.sh: add comment explaining || true on
  sinfo/grep pipeline; distinguish selected-by-availability vs
  fallback in log output; print last 50 lines of job log immediately
  on non-zero exit (not only when YAML is missing)
- phoenix/submit.sh: upgrade set -e to set -euo pipefail; quote $1
  in cat call; log resolved partition at submission time

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/run_parallel_benchmarks.sh | 13 +++++++++++--
 .github/workflows/phoenix/submit.sh        |  5 +++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh
index 6d869059e9..9b9d00369b 100755
--- a/.github/scripts/run_parallel_benchmarks.sh
+++ b/.github/scripts/run_parallel_benchmarks.sh
@@ -26,15 +26,20 @@ if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
     echo "Selecting Phoenix GPU partition for benchmark consistency..."
     BENCH_GPU_PARTITION=""
     for part in gpu-h200 gpu-h100 gpu-a100 gpu-l40s gpu-rtx6000 gpu-v100; do
+        # || true: grep -c exits 1 on zero matches (or when sinfo returns no output
+        # for an unknown partition); suppress so set -euo pipefail doesn't abort.
         idle=$(sinfo -p "$part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
         if [ "${idle:-0}" -gt 0 ]; then
             BENCH_GPU_PARTITION="$part"
+            echo "Selected GPU partition: $BENCH_GPU_PARTITION ($idle idle/mix nodes)"
             break
         fi
     done
-    BENCH_GPU_PARTITION="${BENCH_GPU_PARTITION:-gpu-l40s}"
+    if [ -z "$BENCH_GPU_PARTITION" ]; then
+        echo "WARNING: No idle GPU partition found; falling back to gpu-l40s (may queue)"
+        BENCH_GPU_PARTITION="gpu-l40s"
+    fi
     export BENCH_GPU_PARTITION
-    echo "Selected GPU partition: $BENCH_GPU_PARTITION"
 fi
 
 # Run both jobs with monitoring using dedicated script from PR
@@ -57,6 +62,8 @@ wait "$pr_pid"
 pr_exit=$?
 if [ "$pr_exit" -ne 0 ]; then
   echo "PR job exited with code: $pr_exit"
+  echo "Last 50 lines of PR job log:"
+  tail -n 50 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo "  Could not read PR log"
 else
   echo "PR job completed successfully"
 fi
@@ -65,6 +72,8 @@ wait "$master_pid"
 master_exit=$?
 if [ "$master_exit" -ne 0 ]; then
   echo "Master job exited with code: $master_exit"
+  echo "Last 50 lines of master job log:"
+  tail -n 50 "master/bench-${device}-${interface}.out" 2>/dev/null || echo "  Could not read master log"
 else
   echo "Master job completed successfully"
 fi
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 71623ed494..7e39b2a526 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-set -e
+set -euo pipefail
 
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP
@@ -10,7 +10,7 @@ usage() {
 }
 
 if [ ! -z "$1" ]; then
-    sbatch_script_contents=`cat $1`
+    sbatch_script_contents=$(cat "$1")
 else
     usage
     exit 1
@@ -31,6 +31,7 @@ sbatch_cpu_opts="\
 
 if [ "$job_type" = "bench" ]; then
     bench_partition="${BENCH_GPU_PARTITION:-gpu-l40s}"
+    echo "Submitting bench GPU job to partition: $bench_partition (BENCH_GPU_PARTITION=${BENCH_GPU_PARTITION:-<unset, using default>})"
     sbatch_gpu_opts="\
 #SBATCH -p $bench_partition
 #SBATCH --ntasks-per-node=4       # Number of cores per node required

From b396a1c762b6e766b4bb7aa6860e0db60200dda3 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 21:32:25 -0500
Subject: [PATCH 19/40] ci: add gpu-h200 partition to Phoenix test and
 case-optimization GPU pool

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/phoenix/submit.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 7e39b2a526..be1ab34258 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -40,7 +40,7 @@ if [ "$job_type" = "bench" ]; then
     sbatch_time="#SBATCH -t 04:00:00"
 else
     sbatch_gpu_opts="\
-#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
+#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s,gpu-h200
 #SBATCH --ntasks-per-node=4       # Number of cores per node required
 #SBATCH -G2\
 "

From 7e5cabea5c0cf10d52e49acb7dfe75bd3d0407d7 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 21:56:40 -0500
Subject: [PATCH 20/40] ci: scancel orphaned SLURM jobs when GitHub Actions
 cancels the runner

Write SLURM job ID to <output>.slurm_job_id in run_monitored_slurm_job.sh
so a cancelled() step in test.yml and bench.yml can find and cancel any
in-flight SLURM jobs. This handles the SIGKILL case where the EXIT trap
in monitor_slurm_job.sh cannot fire.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/run_monitored_slurm_job.sh |  4 ++++
 .github/workflows/bench.yml                |  9 +++++++++
 .github/workflows/test.yml                 | 18 ++++++++++++++++++
 3 files changed, 31 insertions(+)

diff --git a/.github/scripts/run_monitored_slurm_job.sh b/.github/scripts/run_monitored_slurm_job.sh
index 6fb9e254ec..d7c2e22704 100644
--- a/.github/scripts/run_monitored_slurm_job.sh
+++ b/.github/scripts/run_monitored_slurm_job.sh
@@ -18,6 +18,10 @@ output_file="$2"
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
+# Write job ID next to the output file so the workflow cancel step can scancel it
+# if GitHub Actions terminates the runner (SIGKILL cannot be caught by trap).
+echo "$job_id" > "${output_file%.out}.slurm_job_id"
+
 monitor_exit=0
 bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
 
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 18febfaacb..0c4afe2d0f 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -125,6 +125,15 @@ jobs:
       - name: Bench (Master v. PR)
         run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
 
+      - name: Cancel SLURM Jobs
+        if: cancelled()
+        run: |
+          find . -name "*.slurm_job_id" | while read f; do
+            job_id=$(cat "$f")
+            echo "Cancelling SLURM job $job_id"
+            scancel "$job_id" 2>/dev/null || true
+          done
+
       - name: Generate & Post Comment
         if: always()
         run: |
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5dd072072d..08a166d51d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -258,6 +258,15 @@ jobs:
       - name: Test
         run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}
 
+      - name: Cancel SLURM Jobs
+        if: cancelled()
+        run: |
+          find . -name "*.slurm_job_id" | while read f; do
+            job_id=$(cat "$f")
+            echo "Cancelling SLURM job $job_id"
+            scancel "$job_id" 2>/dev/null || true
+          done
+
       - name: Compute Log Slug
         if:   always()
         id:   log
@@ -340,6 +349,15 @@ jobs:
       - name: Run Case-Optimization Tests
         run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }}
 
+      - name: Cancel SLURM Jobs
+        if: cancelled()
+        run: |
+          find . -name "*.slurm_job_id" | while read f; do
+            job_id=$(cat "$f")
+            echo "Cancelling SLURM job $job_id"
+            scancel "$job_id" 2>/dev/null || true
+          done
+
       - name: Print Logs
         if:   always()
         run:  |

From cf4f2a6574115d62eac7ca01f7f968dc8a3a4d04 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 23:18:42 -0500
Subject: [PATCH 21/40] Fix Phoenix CPU test: restore build cache to isolate
 concurrent jobs

Concurrent Phoenix jobs (cpu, gpu-acc, gpu-omp) all start simultaneously
on the same runner workspace. With 'rm -rf build', they race on
build/lock.yaml: the gpu-omp job writes gpu='mp', which the cpu test
command then reads, causing --mp-gpu in the cpu banner and a hipfort
cmake failure.

Restore setup-build-cache.sh to give each (device, interface, runner)
its own isolated build directory, preventing the race.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/phoenix/test.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 3e8c9caa66..a47fb0a441 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -3,7 +3,9 @@
 source .github/scripts/gpu-opts.sh
 build_opts="$gpu_opts"
 
-rm -rf build
+# Set up per-config isolated build directory to prevent lock.yaml races when
+# multiple jobs (cpu, gpu-acc, gpu-omp) run concurrently in the same workspace.
+source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
 
 # Build with retry; smoke-test the freshly built syscheck binary to catch
 # architecture mismatches (SIGILL from binaries compiled on a different compute node).

From 7abbce7491fb7cbe91eb123eff26a5503bd165cf Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 23:22:07 -0500
Subject: [PATCH 22/40] Revert "Fix Phoenix CPU test: restore build cache to
 isolate concurrent jobs"

This reverts commit cf4f2a6574115d62eac7ca01f7f968dc8a3a4d04.
---
 .github/workflows/phoenix/test.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index a47fb0a441..3e8c9caa66 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -3,9 +3,7 @@
 source .github/scripts/gpu-opts.sh
 build_opts="$gpu_opts"
 
-# Set up per-config isolated build directory to prevent lock.yaml races when
-# multiple jobs (cpu, gpu-acc, gpu-omp) run concurrently in the same workspace.
-source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
+rm -rf build
 
 # Build with retry; smoke-test the freshly built syscheck binary to catch
 # architecture mismatches (SIGILL from binaries compiled on a different compute node).

From df230114efe791633ec09d1a4f24aa432192bf21 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 23:25:00 -0500
Subject: [PATCH 23/40] Fix Phoenix test: pass explicit GPU flag to test
 command

The test command previously passed no --gpu/--no-gpu flag, so it always
read from build/lock.yaml. If lock.yaml was contaminated (stale from a
prior GPU run, NFS delays, or SLURM requeue race), the CPU test would
inherit gpu='mp' and attempt to build hipfort, causing a cmake failure.

Use ${build_opts:---no-gpu} so CPU jobs explicitly pass --no-gpu and
GPU jobs pass --gpu acc/mp. Lock.yaml content is now irrelevant.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/phoenix/test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 3e8c9caa66..d073c54bde 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -19,4 +19,4 @@ if [ "$job_device" = "gpu" ]; then
     n_test_threads=$((ngpus * 2))
 fi
 
-./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
+./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts ${build_opts:---no-gpu} -- -c phoenix

From 8f586aee1e69018cec0c679e54fa53558ff1c6bf Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 23:31:30 -0500
Subject: [PATCH 24/40] ci: remove self-hosted runner build cache

- Delete setup-build-cache.sh (NFS symlink cache, no longer used)
- Remove dead _cache_nuke escalation from retry-build.sh
- Set clean: true on self-hosted runner checkouts so stale files
  (lock.yaml, .out, SLURM artifacts) cannot persist between runs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/retry-build.sh       |  11 +--
 .github/scripts/setup-build-cache.sh | 101 ---------------------------
 .github/workflows/test.yml           |   4 +-
 3 files changed, 4 insertions(+), 112 deletions(-)
 delete mode 100755 .github/scripts/setup-build-cache.sh

diff --git a/.github/scripts/retry-build.sh b/.github/scripts/retry-build.sh
index b82a2e5d8d..50b3a7f5c1 100755
--- a/.github/scripts/retry-build.sh
+++ b/.github/scripts/retry-build.sh
@@ -5,20 +5,13 @@
 # Usage: source .github/scripts/retry-build.sh
 #        retry_build ./mfc.sh build -j 8 --gpu acc
 
-# Try normal cleanup; if it fails, escalate to cache nuke.
 _retry_clean() {
     local clean_cmd="$1"
     if eval "$clean_cmd" 2>/dev/null; then
         return 0
     fi
-    echo "  Normal cleanup failed."
-    if type _cache_nuke > /dev/null 2>&1; then
-        echo "  Escalating to NFS cache nuke..."
-        _cache_nuke
-    else
-        echo "  _cache_nuke not available, best-effort rm."
-        rm -rf build/staging build/install build/lock.yaml 2>/dev/null || true
-    fi
+    echo "  Cleanup failed; falling back to best-effort rm."
+    rm -rf build/staging build/install build/lock.yaml 2>/dev/null || true
 }
 
 retry_build() {
diff --git a/.github/scripts/setup-build-cache.sh b/.github/scripts/setup-build-cache.sh
deleted file mode 100755
index 7e47175f6e..0000000000
--- a/.github/scripts/setup-build-cache.sh
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/bash
-# Sets up a persistent build cache for self-hosted CI runners.
-# Creates a symlink: ./build -> <cache_root>/<key>/build
-#
-# Each runner gets its own cache keyed by (cluster, device, interface, runner).
-# This avoids cross-runner path issues entirely — CMake's absolute paths are
-# always correct because the same runner always uses the same workspace path.
-#
-# Usage: source .github/scripts/setup-build-cache.sh <cluster> <device> <interface>
-
-_cache_cluster="${1:?Usage: setup-build-cache.sh <cluster> <device> <interface>}"
-_cache_device="${2:?}"
-_cache_interface="${3:-none}"
-_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"
-
-# Select cache root based on cluster (each HPC system has its own persistent storage).
-case "$_cache_cluster" in
-    phoenix)
-        _cache_root="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache" ;;
-    frontier|frontier_amd)
-        _cache_root="/lustre/orion/cfd154/scratch/sbryngelson/.mfc-ci-cache" ;;
-    *)
-        echo "=== Build Cache Setup ==="
-        echo "  No cache root configured for cluster '$_cache_cluster' — skipping."
-        echo "========================="
-        return 0 2>/dev/null || exit 0 ;;
-esac
-
-_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
-_cache_base="${_cache_root}/${_cache_key}/build"
-
-# Check if the cache directory is healthy (readable, writable, no stale handles).
-_cache_healthy() {
-    local dir="$1"
-    if ! ls "$dir" > /dev/null 2>&1; then
-        echo "  Health check FAILED: cannot list $dir"
-        return 1
-    fi
-    if [ -e "$dir/lock.yaml" ] && ! stat "$dir/lock.yaml" > /dev/null 2>&1; then
-        echo "  Health check FAILED: cannot stat $dir/lock.yaml"
-        return 1
-    fi
-    local probe="$dir/.nfs_probe.$$"
-    if ! touch "$probe" 2>/dev/null || ! rm -f "$probe" 2>/dev/null; then
-        echo "  Health check FAILED: cannot write/remove probe in $dir"
-        rm -f "$probe" 2>/dev/null
-        return 1
-    fi
-    return 0
-}
-
-# Nuclear recovery: rename stale cache out of the way and create a fresh one.
-# Uses mv (operates on parent directory entry) which works even when children
-# have stale file handles that prevent rm -rf from succeeding.
-_cache_nuke() {
-    local base="${1:-$_cache_base}"
-    local stale_name="${base}.stale.$(date +%s)"
-    echo "  NFS cache nuke: parking stale dir -> $stale_name"
-    if mv "$base" "$stale_name" 2>/dev/null; then
-        echo "  NFS cache nuke: renamed successfully"
-    else
-        echo "  NFS cache nuke: mv failed, trying rm -rf as fallback"
-        rm -rf "$base" 2>/dev/null || true
-    fi
-    mkdir -p "$base"
-    echo "  NFS cache nuke: fresh cache created at $base"
-}
-
-mkdir -p "$_cache_base"
-_cache_dir="$(cd "$_cache_base" && pwd -P)"
-
-echo "=== Build Cache Setup ==="
-echo "  Cache key: $_cache_key"
-echo "  Cache dir: $_cache_dir"
-
-# Pre-flight: detect stale NFS handles before wasting a build attempt.
-if ! _cache_healthy "$_cache_dir"; then
-    echo "  Stale NFS cache detected — nuking and recreating."
-    _cache_nuke "$_cache_base"
-    _cache_dir="$(cd "$_cache_base" && pwd -P)"
-fi
-
-# Replace any existing build/ (real dir or stale symlink) with a symlink
-# to our runner-specific cache directory.
-# Use unlink for symlinks to avoid rm -rf following the link and deleting
-# the shared cache contents (which another runner may be using).
-if [ -L "build" ]; then
-    unlink "build"
-elif [ -e "build" ]; then
-    rm -rf "build"
-fi
-
-ln -s "$_cache_dir" "build"
-
-echo "  Symlink: build -> $_cache_dir"
-
-# Garbage-collect stale cache dirs parked by _cache_nuke more than 7 days ago.
-_cache_parent="$(dirname "$_cache_base")"
-find "$_cache_parent" -maxdepth 1 -name "*.stale.*" -mtime +7 -exec rm -rf {} + 2>/dev/null || true
-
-echo "========================="
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 08a166d51d..37e57c6bb6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -243,7 +243,7 @@ jobs:
       - name: Clone
         uses: actions/checkout@v4
         with:
-          clean: false
+          clean: true
 
       - name: Build
         if:   matrix.cluster != 'phoenix'
@@ -330,7 +330,7 @@ jobs:
       - name: Clone
         uses: actions/checkout@v4
         with:
-          clean: false
+          clean: true
 
       - name: Pre-Build (SLURM)
         if:   matrix.cluster == 'phoenix'

From 24f25f39503f6cd9ba3c005cbd1848d550f11d24 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 7 Mar 2026 23:36:13 -0500
Subject: [PATCH 25/40] ci: nuke entire build dir on attempt 3 of retry_build

After attempt 1 fails, the existing partial clean runs before attempt 2.
After attempt 2 fails, the entire build/ directory is nuked so the
final attempt (3) starts completely fresh.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/retry-build.sh | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/retry-build.sh b/.github/scripts/retry-build.sh
index 50b3a7f5c1..6ab2e1fed0 100755
--- a/.github/scripts/retry-build.sh
+++ b/.github/scripts/retry-build.sh
@@ -7,6 +7,12 @@
 
 _retry_clean() {
     local clean_cmd="$1"
+    local attempt="${2:-1}"
+    if [ "$attempt" -ge 2 ]; then
+        echo "  Attempt $attempt failed — nuking entire build directory for a clean retry."
+        rm -rf build 2>/dev/null || true
+        return 0
+    fi
     if eval "$clean_cmd" 2>/dev/null; then
         return 0
     fi
@@ -26,8 +32,7 @@ retry_build() {
                 if ! eval "$validate_cmd"; then
                     echo "Post-build validation failed on attempt $attempt."
                     if [ $attempt -lt $max_attempts ]; then
-                        echo "Cleaning and retrying in 5s..."
-                        _retry_clean "$clean_cmd"
+                        _retry_clean "$clean_cmd" "$attempt"
                         sleep 5
                         attempt=$((attempt + 1))
                         continue
@@ -41,8 +46,7 @@ retry_build() {
             return 0
         fi
         if [ $attempt -lt $max_attempts ]; then
-            echo "Build failed on attempt $attempt. Retrying in 30s..."
-            _retry_clean "$clean_cmd"
+            _retry_clean "$clean_cmd" "$attempt"
             sleep 30
         else
             echo "Build failed after $max_attempts attempts."

From 0104233de13d2782cbe09bac1fd0e240e528ed3f Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 10:07:02 -0400
Subject: [PATCH 26/40] ci: reduce to 2 attempts, nuke build dir on retry

Replace 3-attempt loops with 2-attempt loops everywhere. On the single
retry, nuke the entire build directory instead of doing a partial cleanup.

Applies to retry_build() in retry-build.sh, nick-fields/retry in
test.yml (Build and Pre-Build steps), and bench.yml Setup & Build.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/retry-build.sh | 28 +++++++---------------------
 .github/workflows/bench.yml    |  6 ++----
 .github/workflows/test.yml     |  8 ++++----
 3 files changed, 13 insertions(+), 29 deletions(-)

diff --git a/.github/scripts/retry-build.sh b/.github/scripts/retry-build.sh
index 6ab2e1fed0..38ac08b217 100755
--- a/.github/scripts/retry-build.sh
+++ b/.github/scripts/retry-build.sh
@@ -1,29 +1,13 @@
 #!/bin/bash
-# Provides retry_build(): 3-attempt loop with configurable cleanup.
-# Set RETRY_CLEAN_CMD to override cleanup (default: rm -rf build/staging build/install build/lock.yaml).
+# Provides retry_build(): 2-attempt loop.
+# On failure of attempt 1, nukes the entire build directory before attempt 2.
 # Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry.
 # Usage: source .github/scripts/retry-build.sh
 #        retry_build ./mfc.sh build -j 8 --gpu acc
 
-_retry_clean() {
-    local clean_cmd="$1"
-    local attempt="${2:-1}"
-    if [ "$attempt" -ge 2 ]; then
-        echo "  Attempt $attempt failed — nuking entire build directory for a clean retry."
-        rm -rf build 2>/dev/null || true
-        return 0
-    fi
-    if eval "$clean_cmd" 2>/dev/null; then
-        return 0
-    fi
-    echo "  Cleanup failed; falling back to best-effort rm."
-    rm -rf build/staging build/install build/lock.yaml 2>/dev/null || true
-}
-
 retry_build() {
-    local clean_cmd="${RETRY_CLEAN_CMD:-rm -rf build/staging build/install build/lock.yaml}"
     local validate_cmd="${RETRY_VALIDATE_CMD:-}"
-    local max_attempts=3
+    local max_attempts=2
     local attempt=1
     while [ $attempt -le $max_attempts ]; do
         echo "Build attempt $attempt of $max_attempts..."
@@ -32,7 +16,8 @@ retry_build() {
                 if ! eval "$validate_cmd"; then
                     echo "Post-build validation failed on attempt $attempt."
                     if [ $attempt -lt $max_attempts ]; then
-                        _retry_clean "$clean_cmd" "$attempt"
+                        echo "  Nuking build directory before retry..."
+                        rm -rf build 2>/dev/null || true
                         sleep 5
                         attempt=$((attempt + 1))
                         continue
@@ -46,7 +31,8 @@ retry_build() {
             return 0
         fi
         if [ $attempt -lt $max_attempts ]; then
-            _retry_clean "$clean_cmd" "$attempt"
+            echo "  Build failed — nuking build directory before retry..."
+            rm -rf build 2>/dev/null || true
             sleep 30
         else
             echo "Build failed after $max_attempts attempts."
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 0c4afe2d0f..8eede84ae8 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -106,7 +106,7 @@ jobs:
         if: matrix.build_script != ''
         uses: nick-fields/retry@v3
         with:
-          max_attempts: 3
+          max_attempts: 2
           retry_wait_seconds: 60
           timeout_minutes: 150
           command: |
@@ -118,9 +118,7 @@ jobs:
             wait $pid2; e2=$?
             [ $e1 -eq 0 ] && [ $e2 -eq 0 ]
           on_retry_command: |
-            (cd pr     && ./mfc.sh clean) &
-            (cd master && ./mfc.sh clean) &
-            wait
+            rm -rf pr/build master/build
 
       - name: Bench (Master v. PR)
         run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 37e57c6bb6..2c395bf0e9 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -249,11 +249,11 @@ jobs:
         if:   matrix.cluster != 'phoenix'
         uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3
         with:
-          max_attempts: 3
+          max_attempts: 2
           retry_wait_seconds: 60
           timeout_minutes: 60
           command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
-          on_retry_command: ./mfc.sh clean
+          on_retry_command: rm -rf build
 
       - name: Test
         run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}
@@ -340,11 +340,11 @@ jobs:
         if:   matrix.cluster != 'phoenix'
         uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3
         with:
-          max_attempts: 3
+          max_attempts: 2
           retry_wait_seconds: 60
           timeout_minutes: 120
           command: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
-          on_retry_command: ./mfc.sh clean
+          on_retry_command: rm -rf build
 
       - name: Run Case-Optimization Tests
         run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }}

From ffb43f77adcb10a5d4dd514f45da127ca0ccb3d7 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 10:39:25 -0400
Subject: [PATCH 27/40] ci: revert case-opt to clean: false to preserve SLURM
 build cache

The Pre-Build (SLURM) step on Phoenix rebuilds case-optimized binaries
using the runner workspace on NFS. With clean: true, all cached build
artifacts are wiped before each run, causing the SLURM job to build from
scratch and get preempted (~9 min) before completing.

The case-opt job uses explicit GPU opts (gpu-opts.sh) so it is not
affected by the lock.yaml contamination issue that motivated clean: true
in the self job.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2c395bf0e9..f517fb057d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -330,7 +330,7 @@ jobs:
       - name: Clone
         uses: actions/checkout@v4
         with:
-          clean: true
+          clean: false
 
       - name: Pre-Build (SLURM)
         if:   matrix.cluster == 'phoenix'

From fb6101deb0e20094fd6c09997091470e0990e076 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 13:01:37 -0400
Subject: [PATCH 28/40] ci: treat PREEMPTED as non-terminal so --requeue jobs
 keep being monitored

With #SBATCH --requeue, a preempted SLURM job restarts under the same job
ID (PREEMPTED -> PENDING -> RUNNING). Previously PREEMPTED was listed as a
terminal state, causing the monitor to exit immediately and report failure,
discarding the requeue.

Remove PREEMPTED from is_terminal_state() and add it to the PENDING branch
of the pre-output-file wait loop so the monitor keeps polling through the
preemption-requeue cycle and resumes streaming once the job restarts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/monitor_slurm_job.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh
index 0567a2ddb1..1142e97057 100755
--- a/.github/scripts/monitor_slurm_job.sh
+++ b/.github/scripts/monitor_slurm_job.sh
@@ -62,9 +62,11 @@ get_job_state() {
 }
 
 # Check if a state is terminal (job is done, for better or worse)
+# PREEMPTED is intentionally excluded: with --requeue the job restarts under
+# the same job ID and we must keep monitoring rather than exiting early.
 is_terminal_state() {
   case "$1" in
-    COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED|REVOKED)
+    COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|REVOKED)
       return 0 ;;
     *)
       return 1 ;;
@@ -80,7 +82,7 @@ while [ ! -f "$output_file" ]; do
   state=$(get_job_state "$job_id")
 
   case "$state" in
-    PENDING|CONFIGURING)
+    PENDING|CONFIGURING|PREEMPTED)
       unknown_count=0
       sleep 5
       ;;

From 68592d74cd7b41f7eddd65300c83a15fe952b7c0 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 13:36:11 -0400
Subject: [PATCH 29/40] ci: clean build dir before case-opt pre-build; drop
 retry

Stale build artifacts from a previous job with a different compiler
(e.g. gfortran CPU build followed by nvfortran GPU case-opt build)
cause linker failures: undefined references to _gfortran_* symbols.

Add rm -rf build to prebuild-case-optimization.sh, right after the
cluster argument is validated and before modules are loaded, so every
pre-build starts on a clean slate regardless of what the runner workspace
contains. This fixes the liblapack.a cross-compiler link failure.

Also remove the nick-fields/retry wrapper from the Frontier login-node
pre-build step: one clean attempt is sufficient and retries were only
masking the root cause.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/prebuild-case-optimization.sh | 2 ++
 .github/workflows/test.yml                    | 8 +-------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh
index 87f26fdb5f..130f523c07 100755
--- a/.github/scripts/prebuild-case-optimization.sh
+++ b/.github/scripts/prebuild-case-optimization.sh
@@ -21,6 +21,8 @@ case "$cluster" in
     *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
 esac
 
+rm -rf build
+
 . ./mfc.sh load -c "$flag" -m g
 source .github/scripts/gpu-opts.sh
 
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f517fb057d..e6002f5e3b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -338,13 +338,7 @@ jobs:
 
       - name: Pre-Build (login node)
         if:   matrix.cluster != 'phoenix'
-        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3
-        with:
-          max_attempts: 2
-          retry_wait_seconds: 60
-          timeout_minutes: 120
-          command: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
-          on_retry_command: rm -rf build
+        run:  bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
 
       - name: Run Case-Optimization Tests
         run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }}

From 0775fde26785e978e73b2350bc28f0e2df53b058 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 13:49:40 -0400
Subject: [PATCH 30/40] ci: remove dead RETRY_CLEAN_CMD from bench.sh

retry-build.sh no longer has a RETRY_CLEAN_CMD mechanism; the variable
was a no-op that misled readers into thinking it controlled retry cleanup.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/phoenix/bench.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh
index 218cf68a5f..abaf76f33d 100644
--- a/.github/workflows/phoenix/bench.sh
+++ b/.github/workflows/phoenix/bench.sh
@@ -22,7 +22,7 @@ fi
 rm -rf build
 
 source .github/scripts/retry-build.sh
-RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
+retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
 
 ./mfc.sh bench $bench_opts -j $n_jobs -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
 

From aa2162066545bc9a32bcc7f6116c09949fcee0c3 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 15:53:23 -0400
Subject: [PATCH 31/40] ci: allow Frontier jobs to fail without blocking
 workflow

CCE 19.0.0 (cpe/25.03 upgrade) has a compiler bug (IPA SIGSEGV in
m_phase_change.fpp) that causes all Frontier builds to fail. This is
a pre-existing upstream Cray issue unrelated to this PR.

Set continue-on-error conditionally so Frontier matrix entries show
orange/warning while Phoenix and GitHub runner results remain blocking.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml | 1 +
 .github/workflows/test.yml  | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 8eede84ae8..8348c860dd 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -85,6 +85,7 @@ jobs:
             device: gpu
             interface: omp
             build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
+    continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }}
     runs-on:
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e6002f5e3b..ed544b06d0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -165,7 +165,7 @@ jobs:
     name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})"
     if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && github.event.pull_request.draft != true
     needs: [lint-gate, file-changes]
-    continue-on-error: false
+    continue-on-error: ${{ matrix.runner == 'frontier' }}
     timeout-minutes: 480
     strategy:
       matrix:
@@ -293,7 +293,7 @@ jobs:
     name: "Case Opt | ${{ matrix.cluster_name }} (${{ matrix.device }}-${{ matrix.interface }})"
     if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && github.event.pull_request.draft != true
     needs: [lint-gate, file-changes]
-    continue-on-error: false
+    continue-on-error: ${{ matrix.runner == 'frontier' }}
     timeout-minutes: 480
     strategy:
       matrix:

From 18311b83dcc0b4d1891cc8ef14b85b2059f1a8fc Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 16:11:56 -0400
Subject: [PATCH 32/40] ci: fix shellcheck SC2162 - use read -r in while loops

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml | 2 +-
 .github/workflows/test.yml  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 8348c860dd..5f4c5003e6 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -127,7 +127,7 @@ jobs:
       - name: Cancel SLURM Jobs
         if: cancelled()
         run: |
-          find . -name "*.slurm_job_id" | while read f; do
+          find . -name "*.slurm_job_id" | while read -r f; do
             job_id=$(cat "$f")
             echo "Cancelling SLURM job $job_id"
             scancel "$job_id" 2>/dev/null || true
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ed544b06d0..3a84d12f41 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -261,7 +261,7 @@ jobs:
       - name: Cancel SLURM Jobs
         if: cancelled()
         run: |
-          find . -name "*.slurm_job_id" | while read f; do
+          find . -name "*.slurm_job_id" | while read -r f; do
             job_id=$(cat "$f")
             echo "Cancelling SLURM job $job_id"
             scancel "$job_id" 2>/dev/null || true
@@ -346,7 +346,7 @@ jobs:
       - name: Cancel SLURM Jobs
         if: cancelled()
         run: |
-          find . -name "*.slurm_job_id" | while read f; do
+          find . -name "*.slurm_job_id" | while read -r f; do
             job_id=$(cat "$f")
             echo "Cancelling SLURM job $job_id"
             scancel "$job_id" 2>/dev/null || true

From f572dcfea49c02657a9026f54ccbccbc4e828532 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 21:24:51 -0400
Subject: [PATCH 33/40] bench: prefer rtx6000/l40s/v100 over h200/h100/a100 for
 GPU partition

High-end HPC partitions (H200, H100, A100) are high-demand and give
inconsistent availability. RTX 6000 (35 nodes) and L40S are more
consistently available, giving more reproducible benchmark timings.
Fallback also updated from gpu-l40s to gpu-rtx6000.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/run_parallel_benchmarks.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh
index 9b9d00369b..fa8c9bda77 100755
--- a/.github/scripts/run_parallel_benchmarks.sh
+++ b/.github/scripts/run_parallel_benchmarks.sh
@@ -25,7 +25,7 @@ echo "=========================================="
 if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
     echo "Selecting Phoenix GPU partition for benchmark consistency..."
     BENCH_GPU_PARTITION=""
-    for part in gpu-h200 gpu-h100 gpu-a100 gpu-l40s gpu-rtx6000 gpu-v100; do
+    for part in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do
         # || true: grep -c exits 1 on zero matches (or when sinfo returns no output
         # for an unknown partition); suppress so set -euo pipefail doesn't abort.
         idle=$(sinfo -p "$part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
@@ -37,7 +37,7 @@ if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
     done
     if [ -z "$BENCH_GPU_PARTITION" ]; then
         echo "WARNING: No idle GPU partition found; falling back to gpu-l40s (may queue)"
-        BENCH_GPU_PARTITION="gpu-l40s"
+        BENCH_GPU_PARTITION="gpu-rtx6000"
     fi
     export BENCH_GPU_PARTITION
 fi

From 8f298d17c0890d602e466a3031b30631273f6dec Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 22:12:46 -0400
Subject: [PATCH 34/40] ci: decouple SLURM submit from monitor for Phoenix jobs
 (Option 2)

Split Phoenix test and case-optimization CI steps into separate Submit
and Monitor phases so a transient GHA connectivity drop cannot waste
compute: the Submit step is idempotent (exits immediately after writing
the job ID file), and the Monitor step re-attaches to an existing RUNNING/
PENDING SLURM job on rerun rather than submitting a new one.

- New submit-job.sh: idempotent sbatch submission; writes job ID to
  <slug>.slurm_job_id and exits immediately; skips resubmission if a
  live SLURM job for this slug already exists (state RUNNING, PENDING,
  REQUEUED, or COMPLETING); any other state is treated as stale
- Refactored submit.sh: thin wrapper that calls submit-job.sh then
  run_monitored_slurm_job.sh (backward-compatible; bench and Frontier
  callers unchanged)
- test.yml self job: clean: false on checkout so .slurm_job_id survives
  reruns; Test step split into Submit SLURM Test Job + Monitor SLURM
  Test Job (phoenix only); Frontier unchanged
- test.yml case-optimization: Run Case-Optimization Tests split into
  Submit + Monitor (phoenix only); Frontier unchanged; Pre-Build (SLURM)
  gets idempotency automatically via refactored submit.sh

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/phoenix/submit-job.sh | 123 ++++++++++++++++++++++++
 .github/workflows/phoenix/submit.sh     |  90 +++--------------
 .github/workflows/test.yml              |  24 ++++-
 3 files changed, 157 insertions(+), 80 deletions(-)
 create mode 100755 .github/workflows/phoenix/submit-job.sh

diff --git a/.github/workflows/phoenix/submit-job.sh b/.github/workflows/phoenix/submit-job.sh
new file mode 100755
index 0000000000..a4a63d43a9
--- /dev/null
+++ b/.github/workflows/phoenix/submit-job.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+# Submit a SLURM job without waiting for it to complete.
+# Writes the job ID to <job_slug>.slurm_job_id so a separate monitor step can wait.
+# Idempotent: if a job for this slug is still RUNNING or PENDING, skip resubmission.
+#
+# Usage: submit-job.sh [script.sh] [cpu|gpu] [none|acc|omp]
+
+set -euo pipefail
+
+# Ignore SIGHUP to survive login node session drops
+trap '' HUP
+
+usage() {
+    echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp]"
+}
+
+if [ -z "${1:-}" ]; then
+    usage
+    exit 1
+fi
+
+sbatch_script_contents=$(cat "$1")
+
+# Detect job type from submitted script basename
+script_basename="$(basename "$1" .sh)"
+case "$script_basename" in
+    bench*) job_type="bench" ;;
+    *)      job_type="test"  ;;
+esac
+
+sbatch_cpu_opts="\
+#SBATCH -p cpu-small               # partition
+#SBATCH --ntasks-per-node=24       # Number of cores per node required
+#SBATCH --mem-per-cpu=2G           # Memory per core\
+"
+
+if [ "$job_type" = "bench" ]; then
+    bench_partition="${BENCH_GPU_PARTITION:-gpu-rtx6000}"
+    echo "Submitting bench GPU job to partition: $bench_partition (BENCH_GPU_PARTITION=${BENCH_GPU_PARTITION:-<unset, using default>})"
+    sbatch_gpu_opts="\
+#SBATCH -p $bench_partition
+#SBATCH --ntasks-per-node=4       # Number of cores per node required
+#SBATCH -G2\
+"
+    sbatch_time="#SBATCH -t 04:00:00"
+else
+    sbatch_gpu_opts="\
+#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s,gpu-h200
+#SBATCH --ntasks-per-node=4       # Number of cores per node required
+#SBATCH -G2\
+"
+    sbatch_time="#SBATCH -t 03:00:00"
+fi
+
+if [ "$2" = "cpu" ]; then
+    sbatch_device_opts="$sbatch_cpu_opts"
+elif [ "$2" = "gpu" ]; then
+    sbatch_device_opts="$sbatch_gpu_opts"
+else
+    usage
+    exit 1
+fi
+
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
+output_file="$job_slug.out"
+id_file="${job_slug}.slurm_job_id"
+
+# Idempotency: if a live job already exists for this slug, skip resubmission.
+# Only RUNNING/PENDING jobs are reused — a COMPLETED/FAILED job means we should
+# run fresh (e.g. new commit pushed) or the monitor step will verify it separately.
+if [ -f "$id_file" ]; then
+    existing_id=$(cat "$id_file")
+    state=$(sacct -j "$existing_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true)
+    case "${state:-UNKNOWN}" in
+        RUNNING|PENDING|REQUEUED|COMPLETING)
+            echo "Reusing existing SLURM job $existing_id (state=$state) — skipping resubmission"
+            exit 0
+            ;;
+        *)
+            echo "Stale job $existing_id (state=${state:-UNKNOWN}) — resubmitting"
+            rm -f "$id_file"
+            ;;
+    esac
+fi
+
+submit_output=$(sbatch <<EOT
+#!/bin/bash
+#SBATCH -Jshb-$job_slug            # Job name
+#SBATCH --account=gts-sbryngelson3 # charge account
+#SBATCH -N1                        # Number of nodes required
+$sbatch_device_opts
+$sbatch_time
+#SBATCH -q embers                  # QOS Name
+#SBATCH --requeue                  # Auto-requeue on preemption
+#SBATCH -o$output_file             # Combined output and error messages file
+
+set -e
+set -x
+
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in $(pwd):"
+
+job_slug="$job_slug"
+job_device="$2"
+job_interface="$3"
+
+. ./mfc.sh load -c p -m $2
+
+$sbatch_script_contents
+
+EOT
+)
+
+job_id=$(echo "$submit_output" | grep -oE '[0-9]+')
+if [ -z "$job_id" ]; then
+    echo "ERROR: Failed to submit job. sbatch output:"
+    echo "$submit_output"
+    exit 1
+fi
+
+echo "Submitted batch job $job_id"
+echo "$job_id" > "$id_file"
+echo "Job ID written to $id_file"
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index be1ab34258..945db21fbc 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -1,4 +1,8 @@
 #!/bin/bash
+# Submit a SLURM job and wait for it to complete.
+# Delegates submission (with idempotency) to submit-job.sh, then monitors.
+#
+# Usage: submit.sh [script.sh] [cpu|gpu] [none|acc|omp]
 
 set -euo pipefail
 
@@ -9,92 +13,20 @@ usage() {
     echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp]"
 }
 
-if [ ! -z "$1" ]; then
-    sbatch_script_contents=$(cat "$1")
-else
+if [ -z "${1:-}" ]; then
     usage
     exit 1
 fi
 
-# Detect job type from submitted script basename
-script_basename="$(basename "$1" .sh)"
-case "$script_basename" in
-    bench*) job_type="bench" ;;
-    *)      job_type="test"  ;;
-esac
-
-sbatch_cpu_opts="\
-#SBATCH -p cpu-small               # partition
-#SBATCH --ntasks-per-node=24       # Number of cores per node required
-#SBATCH --mem-per-cpu=2G           # Memory per core\
-"
-
-if [ "$job_type" = "bench" ]; then
-    bench_partition="${BENCH_GPU_PARTITION:-gpu-l40s}"
-    echo "Submitting bench GPU job to partition: $bench_partition (BENCH_GPU_PARTITION=${BENCH_GPU_PARTITION:-<unset, using default>})"
-    sbatch_gpu_opts="\
-#SBATCH -p $bench_partition
-#SBATCH --ntasks-per-node=4       # Number of cores per node required
-#SBATCH -G2\
-"
-    sbatch_time="#SBATCH -t 04:00:00"
-else
-    sbatch_gpu_opts="\
-#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s,gpu-h200
-#SBATCH --ntasks-per-node=4       # Number of cores per node required
-#SBATCH -G2\
-"
-    sbatch_time="#SBATCH -t 03:00:00"
-fi
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
-if [ "$2" = "cpu" ]; then
-    sbatch_device_opts="$sbatch_cpu_opts"
-elif [ "$2" = "gpu" ]; then
-    sbatch_device_opts="$sbatch_gpu_opts"
-else
-    usage
-    exit 1
-fi
+# Submit (idempotent — skips resubmission if a live job already exists)
+bash "$SCRIPT_DIR/submit-job.sh" "$@"
 
+# Derive the same job slug and file paths as submit-job.sh
 job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
 output_file="$job_slug.out"
+id_file="${job_slug}.slurm_job_id"
 
-submit_output=$(sbatch <<EOT
-#!/bin/bash
-#SBATCH -Jshb-$job_slug            # Job name
-#SBATCH --account=gts-sbryngelson3 # charge account
-#SBATCH -N1                        # Number of nodes required
-$sbatch_device_opts
-$sbatch_time
-#SBATCH -q embers                  # QOS Name
-#SBATCH --requeue                  # Auto-requeue on preemption
-#SBATCH -o$output_file             # Combined output and error messages file
-
-set -e
-set -x
-
-cd "\$SLURM_SUBMIT_DIR"
-echo "Running in $(pwd):"
-
-job_slug="$job_slug"
-job_device="$2"
-job_interface="$3"
-
-. ./mfc.sh load -c p -m $2
-
-$sbatch_script_contents
-
-EOT
-)
-
-job_id=$(echo "$submit_output" | grep -oE '[0-9]+')
-if [ -z "$job_id" ]; then
-    echo "ERROR: Failed to submit job. sbatch output:"
-    echo "$submit_output"
-    exit 1
-fi
-
-echo "Submitted batch job $job_id"
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+job_id=$(cat "$id_file")
 bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3a84d12f41..084f0d884f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -243,7 +243,7 @@ jobs:
       - name: Clone
         uses: actions/checkout@v4
         with:
-          clean: true
+          clean: false
 
       - name: Build
         if:   matrix.cluster != 'phoenix'
@@ -255,7 +255,18 @@ jobs:
           command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
           on_retry_command: rm -rf build
 
+      - name: Submit SLURM Test Job
+        if:   matrix.cluster == 'phoenix'
+        run:  bash .github/workflows/phoenix/submit-job.sh .github/workflows/phoenix/test.sh ${{ matrix.device }} ${{ matrix.interface }}
+
+      - name: Monitor SLURM Test Job
+        if:   matrix.cluster == 'phoenix'
+        run: |
+          slug="test-${{ matrix.device }}-${{ matrix.interface }}"
+          bash .github/scripts/run_monitored_slurm_job.sh "$(cat ${slug}.slurm_job_id)" "${slug}.out"
+
       - name: Test
+        if:   matrix.cluster != 'phoenix'
         run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}
 
       - name: Cancel SLURM Jobs
@@ -340,7 +351,18 @@ jobs:
         if:   matrix.cluster != 'phoenix'
         run:  bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
 
+      - name: Submit Case-Optimization Tests
+        if:   matrix.cluster == 'phoenix'
+        run:  bash .github/workflows/phoenix/submit-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }}
+
+      - name: Monitor Case-Optimization Tests
+        if:   matrix.cluster == 'phoenix'
+        run: |
+          slug="run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}"
+          bash .github/scripts/run_monitored_slurm_job.sh "$(cat ${slug}.slurm_job_id)" "${slug}.out"
+
       - name: Run Case-Optimization Tests
+        if:   matrix.cluster != 'phoenix'
         run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }}
 
       - name: Cancel SLURM Jobs

From 38df3837eb6ba33feef0d080f451e1e340f59fbe Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 23:16:47 -0400
Subject: [PATCH 35/40] ci: fix --precision flag and remove Python 3.14 step in
 github job

--${{ matrix.precision }} expands to bare '--' when precision is empty,
which is parsed as an argument separator. Restore the PRECISION env var
approach so empty precision is correctly omitted.

Remove the manual 'Set up Python 3.14' step; the lint-gate job already
pins Python 3.12 via setup-python and the github runners use the
system default for the build/test steps.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/test.yml | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index b9fe5ac8a1..31634a8302 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -137,16 +137,12 @@ jobs:
           printenv | sort > /tmp/env_after
           diff /tmp/env_before /tmp/env_after | grep '^>' | sed 's/^> //' >> $GITHUB_ENV
 
-      - name: Set up Python 3.14
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.14'
-
       - name: Build
         run:  |
-          /bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL
+          /bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} $PRECISION $TEST_ALL
         env:
           TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
+          PRECISION: ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }}
 
       - name: Test
         run:  |

From 07c4ab00406689185a04c9fb0d18fc9049c5b89b Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 23:18:10 -0400
Subject: [PATCH 36/40] ci: fix fallback partition message, remove dead
 RETRY_CLEAN_CMD, fix precision flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- run_parallel_benchmarks.sh: fix warning message to say gpu-rtx6000
  (not gpu-l40s) to match the actual fallback value
- phoenix/bench.sh: remove RETRY_CLEAN_CMD prefix on retry_build —
  retry-build.sh only uses RETRY_VALIDATE_CMD and always does rm -rf build
  on retry; variable was dead and re-introduced by upstream merge
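- test.yml: fix --${{ matrix.precision }} expanding to bare '--' when
  precision is empty; restore PRECISION env var pattern; remove manual
  'Set up Python 3.14' step (these test.yml edits are carried by the
  preceding commit, PATCH 35/40; this commit's diff touches only the
  two files listed above)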

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/run_parallel_benchmarks.sh | 2 +-
 .github/workflows/phoenix/bench.sh         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh
index fa8c9bda77..eaa9e68f5f 100755
--- a/.github/scripts/run_parallel_benchmarks.sh
+++ b/.github/scripts/run_parallel_benchmarks.sh
@@ -36,7 +36,7 @@ if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
         fi
     done
     if [ -z "$BENCH_GPU_PARTITION" ]; then
-        echo "WARNING: No idle GPU partition found; falling back to gpu-l40s (may queue)"
+        echo "WARNING: No idle GPU partition found; falling back to gpu-rtx6000 (may queue)"
         BENCH_GPU_PARTITION="gpu-rtx6000"
     fi
     export BENCH_GPU_PARTITION
diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh
index 218cf68a5f..abaf76f33d 100644
--- a/.github/workflows/phoenix/bench.sh
+++ b/.github/workflows/phoenix/bench.sh
@@ -22,7 +22,7 @@ fi
 rm -rf build
 
 source .github/scripts/retry-build.sh
-RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
+retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
 
 ./mfc.sh bench $bench_opts -j $n_jobs -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
 

From 1c81fc05c56d7682dca5aecdbdbb09a861959b9b Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 23:26:53 -0400
Subject: [PATCH 37/40] ci: submit-job.sh always submits fresh, cancels any
 stale SLURM job first
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On rerun, the intent is to cancel the old job and run a fresh one —
not to reuse an existing running job. Remove the early-exit that skipped
resubmission for RUNNING/PENDING jobs; instead scancel any live job as a
safety net (in case the 'Cancel SLURM Jobs' step did not fire due to
SIGKILL), then always fall through to a fresh sbatch submission.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/phoenix/submit-job.sh | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/phoenix/submit-job.sh b/.github/workflows/phoenix/submit-job.sh
index a4a63d43a9..9e0247bd11 100755
--- a/.github/workflows/phoenix/submit-job.sh
+++ b/.github/workflows/phoenix/submit-job.sh
@@ -65,22 +65,22 @@ job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
 output_file="$job_slug.out"
 id_file="${job_slug}.slurm_job_id"
 
-# Idempotency: if a live job already exists for this slug, skip resubmission.
-# Only RUNNING/PENDING jobs are reused — a COMPLETED/FAILED job means we should
-# run fresh (e.g. new commit pushed) or the monitor step will verify it separately.
+# On rerun, always submit a fresh job for this slug. If the previous job is
+# still live (RUNNING/PENDING/REQUEUED/COMPLETING), scancel it first as a safety
+# net in case the "Cancel SLURM Jobs" step did not fire (e.g. runner was SIGKILL'd).
 if [ -f "$id_file" ]; then
     existing_id=$(cat "$id_file")
     state=$(sacct -j "$existing_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true)
     case "${state:-UNKNOWN}" in
         RUNNING|PENDING|REQUEUED|COMPLETING)
-            echo "Reusing existing SLURM job $existing_id (state=$state) — skipping resubmission"
-            exit 0
+            echo "Cancelling stale SLURM job $existing_id (state=$state) before resubmission"
+            scancel "$existing_id" 2>/dev/null || true
             ;;
         *)
-            echo "Stale job $existing_id (state=${state:-UNKNOWN}) — resubmitting"
-            rm -f "$id_file"
+            echo "Stale job $existing_id (state=${state:-UNKNOWN}) — submitting fresh"
             ;;
     esac
+    rm -f "$id_file"
 fi
 
 submit_output=$(sbatch <<EOT

From 0a398034798ef629e851d461ca86afbe10d97dc4 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 23:38:30 -0400
Subject: [PATCH 38/40] ci: fix heredoc pwd expansion, backtick substitution,
 combine bench log steps

- submit-job.sh: escape $(pwd) in heredoc so it expands on the compute
  node after cd $SLURM_SUBMIT_DIR, not on the login node at sbatch time
- submit-job.sh: replace backtick command substitution with $() for
  job_slug (modern bash style, consistent with rest of script)
- bench.yml: combine 'Print Per-Case Failure Logs' and 'Print Per-Case
  Success Logs' into a single 'Print Per-Case Logs' step that labels
  each output as [PASSED] or [FAILED] inline

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml             | 18 ++++--------------
 .github/workflows/phoenix/submit-job.sh |  4 ++--
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 5f4c5003e6..24576c1507 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -145,19 +145,7 @@ jobs:
           cat pr/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true
           cat master/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true
 
-      - name: Print Per-Case Failure Logs
-        if: always()
-        run: |
-          for out in pr/build/benchmarks/*/*.out master/build/benchmarks/*/*.out; do
-            [ -f "$out" ] || continue
-            yaml="${out%.out}.yaml"
-            if [ ! -f "$yaml" ]; then
-              echo "=== [FAILED] $out ==="
-              cat "$out"
-            fi
-          done
-
-      - name: Print Per-Case Success Logs
+      - name: Print Per-Case Logs
         if: always()
         run: |
           for out in pr/build/benchmarks/*/*.out master/build/benchmarks/*/*.out; do
@@ -165,8 +153,10 @@ jobs:
             yaml="${out%.out}.yaml"
             if [ -f "$yaml" ]; then
               echo "=== [PASSED] $out ==="
-              cat "$out"
+            else
+              echo "=== [FAILED] $out ==="
             fi
+            cat "$out"
           done
 
       # All other runners (non-Phoenix) just run without special env
diff --git a/.github/workflows/phoenix/submit-job.sh b/.github/workflows/phoenix/submit-job.sh
index 9e0247bd11..caa6bd2175 100755
--- a/.github/workflows/phoenix/submit-job.sh
+++ b/.github/workflows/phoenix/submit-job.sh
@@ -61,7 +61,7 @@ else
     exit 1
 fi
 
-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
+job_slug="$(basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-$2-$3"
 output_file="$job_slug.out"
 id_file="${job_slug}.slurm_job_id"
 
@@ -98,7 +98,7 @@ set -e
 set -x
 
 cd "\$SLURM_SUBMIT_DIR"
-echo "Running in $(pwd):"
+echo "Running in \$(pwd):"
 
 job_slug="$job_slug"
 job_device="$2"

From e686654647e2b2ec7ed96a6051f9a1783fcfa8d6 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 8 Mar 2026 23:57:43 -0400
Subject: [PATCH 39/40] ci: remove redundant slurm_job_id write, improve bench
 log output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- run_monitored_slurm_job.sh: remove redundant .slurm_job_id write;
  submit-job.sh already writes it before the monitor is called, so the
  second write was a no-op in all code paths
- bench.yml: rework the single 'Print Per-Case Logs' step (the separate
  failure/success steps were already merged in the previous commit): print
  a summary (N failed, N passed) listing every case name first, then full
  logs only for failed cases — passing runs stay concise

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/run_monitored_slurm_job.sh |  4 ----
 .github/workflows/bench.yml                | 23 +++++++++++++++-------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/.github/scripts/run_monitored_slurm_job.sh b/.github/scripts/run_monitored_slurm_job.sh
index d7c2e22704..6fb9e254ec 100644
--- a/.github/scripts/run_monitored_slurm_job.sh
+++ b/.github/scripts/run_monitored_slurm_job.sh
@@ -18,10 +18,6 @@ output_file="$2"
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
-# Write job ID next to the output file so the workflow cancel step can scancel it
-# if GitHub Actions terminates the runner (SIGKILL cannot be caught by trap).
-echo "$job_id" > "${output_file%.out}.slurm_job_id"
-
 monitor_exit=0
 bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
 
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 24576c1507..8a1c848493 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -148,17 +148,26 @@ jobs:
       - name: Print Per-Case Logs
         if: always()
         run: |
+          passed=() failed=()
           for out in pr/build/benchmarks/*/*.out master/build/benchmarks/*/*.out; do
             [ -f "$out" ] || continue
-            yaml="${out%.out}.yaml"
-            if [ -f "$yaml" ]; then
-              echo "=== [PASSED] $out ==="
-            else
-              echo "=== [FAILED] $out ==="
-            fi
-            cat "$out"
+            [ -f "${out%.out}.yaml" ] && passed+=("$out") || failed+=("$out")
           done
 
+          echo "=== Per-Case Summary: ${#failed[@]} failed, ${#passed[@]} passed ==="
+          for out in "${failed[@]}"; do echo "  [FAILED] $out"; done
+          for out in "${passed[@]}"; do echo "  [PASSED] $out"; done
+
+          if [ ${#failed[@]} -gt 0 ]; then
+            echo ""
+            echo "=== Failed Case Logs ==="
+            for out in "${failed[@]}"; do
+              echo "--- $out ---"
+              cat "$out"
+              echo ""
+            done
+          fi
+
       # All other runners (non-Phoenix) just run without special env
       - name: Archive Logs (Frontier)
         if: always() && matrix.cluster != 'phoenix'

From b97320b6b23b6988f3cd84c999771c4a4df2c56f Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 9 Mar 2026 00:05:27 -0400
Subject: [PATCH 40/40] ci: add explanatory comments, fix backtick in submit.sh

- submit.sh: replace backtick with $() for job_slug; add comment that
  the sed pipeline must stay in sync with submit-job.sh
- test.yml: explain clean: false (preserves .slurm_job_id for stale job
  detection) and continue-on-error on Frontier (CCE compiler instability)
- run_parallel_benchmarks.sh: explain GPU partition priority order
  (prefer smaller/older partitions to leave large nodes for production)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/run_parallel_benchmarks.sh | 3 +++
 .github/workflows/phoenix/submit.sh        | 6 ++++--
 .github/workflows/test.yml                 | 6 ++++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh
index eaa9e68f5f..8c562b911e 100755
--- a/.github/scripts/run_parallel_benchmarks.sh
+++ b/.github/scripts/run_parallel_benchmarks.sh
@@ -24,6 +24,9 @@ echo "=========================================="
 # both parallel jobs so PR and master always land on the same GPU type.
 if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
     echo "Selecting Phoenix GPU partition for benchmark consistency..."
+    # Prefer older/smaller partitions first (rtx6000, l40s, v100) to leave
+    # large modern nodes (h200, h100, a100) free for production workloads.
+    # rtx6000 has the most nodes and gives the most consistent baselines.
     BENCH_GPU_PARTITION=""
     for part in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do
         # || true: grep -c exits 1 on zero matches (or when sinfo returns no output
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 945db21fbc..0c009bd001 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -23,8 +23,10 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-# Submit (idempotent — skips resubmission if a live job already exists)
+# Submit (always fresh — submit-job.sh cancels any live job for this slug first)
 bash "$SCRIPT_DIR/submit-job.sh" "$@"
 
-# Derive the same job slug and file paths as submit-job.sh
-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
+# Derive the same job slug and file paths as submit-job.sh.
+# NOTE: this sed pipeline must stay identical to the one in submit-job.sh —
+# if they diverge the id-file will not be found and the monitor will fail.
+job_slug="$(basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-$2-$3"
 output_file="$job_slug.out"
 id_file="${job_slug}.slurm_job_id"
 
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 31634a8302..9ce6dda24c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -155,6 +155,9 @@ jobs:
     name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})"
     if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && github.event.pull_request.draft != true
     needs: [lint-gate, file-changes]
+    # Frontier CCE compiler is periodically broken by toolchain updates (e.g.
+    # cpe/25.03 introduced an IPA SIGSEGV in CCE 19.0.0). Allow Frontier to
+    # fail without blocking PR merges; Phoenix remains a hard gate.
     continue-on-error: ${{ matrix.runner == 'frontier' }}
     timeout-minutes: 480
     strategy:
@@ -233,6 +236,8 @@ jobs:
       - name: Clone
         uses: actions/checkout@v4
         with:
+          # clean: false preserves .slurm_job_id files across reruns so
+          # submit-job.sh can detect and cancel stale SLURM jobs on retry.
           clean: false
 
       - name: Build
@@ -294,6 +299,7 @@ jobs:
     name: "Case Opt | ${{ matrix.cluster_name }} (${{ matrix.device }}-${{ matrix.interface }})"
     if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && github.event.pull_request.draft != true
     needs: [lint-gate, file-changes]
+    # Frontier is non-blocking for the same reason as the self job above.
     continue-on-error: ${{ matrix.runner == 'frontier' }}
     timeout-minutes: 480
     strategy: