From b9da01e17ac3d2ae5105afb0e1a6c8b116080b4b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 5 Mar 2026 02:47:53 -0500 Subject: [PATCH 01/27] Work around CCE 19.0.0 compiler bugs for Cray+OpenACC builds Five distinct issues affecting CCE 19.0.0 builds required fixes: Bug 1: InstCombine ICE in matmul() in m_phase_change.fpp - Replace matmul() with explicit 2x2 arithmetic Bug 2: IPA bring_routine_resident SIGSEGV in m_phase_change.fpp - Add -Oipa0 per-file in CMakeLists.txt (Cray+OpenACC only) - Use cray_noinline=True on 4 GPU_ROUTINE calls in m_phase_change.fpp and 4 in m_variables_conversion.fpp Bug 3: IPA castIsValid ICE in m_bubbles_EL.fpp - Change proc_bubble_counts from VLA to allocatable - Add -Oipa0 per-file in CMakeLists.txt (Cray+OpenACC only) Bug 4: m_chemistry.fpp VLA ICE in case-optimized pre_process builds - Guard 4 dimension(num_species) local arrays with USING_CCE Bug 5: Pyrometheus GPU_ROUTINE macro missing !$acc routine seq on Cray+ACC - Post-process generated m_thermochem.f90 in toolchain/mfc/run/input.py to replace the broken Cray INLINEALWAYS-only macro with plain #define GPU_ROUTINE(name) !$acc routine seq Also fix uninitialized FT in s_TSat (use huge(1.0_wp) not huge(FT)). See PR #1286. 
--- CMakeLists.txt | 14 ++++++++++++ src/common/include/parallel_macros.fpp | 30 ++++++++++++++++++++++++-- src/common/m_chemistry.fpp | 16 ++++++++++---- src/common/m_phase_change.fpp | 27 ++++++++++++++++++----- src/common/m_variables_conversion.fpp | 8 +++---- src/simulation/m_bubbles_EL.fpp | 7 ++++-- toolchain/mfc/run/input.py | 22 ++++++++++++++----- 7 files changed, 102 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ddb3876724..b90ed03322 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -397,6 +397,7 @@ HANDLE_SOURCES(simulation ON) HANDLE_SOURCES(post_process ON) HANDLE_SOURCES(syscheck OFF) + # MFC_SETUP_TARGET: Given a target (herein ), this macro creates a new # executable with the appropriate sources, compiler definitions, and # linked libraries (assuming HANDLE_SOURCES was called on ). @@ -633,6 +634,19 @@ if (MFC_SIMULATION) MFC_SETUP_TARGET(TARGET simulation SOURCES "${simulation_SRCs}" MPI FFTW OpenACC OpenMP) + # CCE 19.0.0 IPA workaround: two files trigger IPA crashes: + # m_bubbles_EL: castIsValid assertion (InstCombine/foldIntegerTypedPHI) + # m_phase_change: bring_routine_resident SIGSEGV + # Disabling IPA per-file avoids the crashes while preserving IPA for + # the rest of simulation (needed for thermochem INLINEALWAYS inlining). + # See PR #1286. 
+ if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray" AND MFC_OpenACC) + set_source_files_properties( + "${CMAKE_BINARY_DIR}/fypp/simulation/m_bubbles_EL.fpp.f90" + "${CMAKE_BINARY_DIR}/fypp/simulation/m_phase_change.fpp.f90" + PROPERTIES COMPILE_FLAGS "-Oipa0" + ) + endif() endif() if (MFC_POST_PROCESS) diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp index c5ad5c1fb3..e1cd3ff30b 100644 --- a/src/common/include/parallel_macros.fpp +++ b/src/common/include/parallel_macros.fpp @@ -48,18 +48,44 @@ #:enddef -#:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, extraAccArgs=None, extraOmpArgs=None) +#:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, cray_noinline=False, extraAccArgs=None, extraOmpArgs=None) #:assert isinstance(cray_inline, bool) + #:assert isinstance(cray_noinline, bool) + #:assert not (cray_inline and cray_noinline), "cray_inline and cray_noinline are mutually exclusive" #:set acc_directive = ACC_ROUTINE(function_name=function_name, parallelism=parallelism, nohost=nohost, extraAccArgs=extraAccArgs) #:set omp_directive = OMP_ROUTINE(function_name=function_name, nohost=nohost, extraOmpArgs=extraOmpArgs) - #:if cray_inline == True + #:if cray_noinline == True + #:if not isinstance(function_name, str) + #:stop "When using cray_noinline, function name must be given and given as a string" + #:endif + #:set cray_noinline_directive = ('!DIR$ NOINLINE ' + function_name).strip('\n') +#ifdef _CRAYFTN +#if MFC_OpenACC + $:acc_directive +#elif MFC_OpenMP + $:omp_directive +#else + $:cray_noinline_directive +#endif +#elif MFC_OpenACC + $:acc_directive +#elif MFC_OpenMP + $:omp_directive +#endif + #:elif cray_inline == True #:if not isinstance(function_name, str) #:stop "When inlining for Cray Compiler, function name must be given and given as a string" #:endif #:set cray_directive = ('!DIR$ INLINEALWAYS ' + function_name).strip('\n') #ifdef _CRAYFTN +#if 
MFC_OpenACC + $:acc_directive +#elif MFC_OpenMP + $:omp_directive +#else $:cray_directive +#endif #elif MFC_OpenACC $:acc_directive #elif MFC_OpenMP diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp index d7ffbc3cfe..432b1eb02e 100644 --- a/src/common/m_chemistry.fpp +++ b/src/common/m_chemistry.fpp @@ -63,7 +63,11 @@ contains integer :: x, y, z, eqn real(wp) :: energy, T_in - real(wp), dimension(num_species) :: Ys + #:if USING_CCE + real(wp), dimension(10) :: Ys + #:else + real(wp), dimension(num_species) :: Ys + #:endif do z = bounds(3)%beg, bounds(3)%end do y = bounds(2)%beg, bounds(2)%end @@ -101,7 +105,11 @@ contains type(int_bounds_info), dimension(1:3), intent(in) :: bounds integer :: x, y, z, i - real(wp), dimension(num_species) :: Ys + #:if USING_CCE + real(wp), dimension(10) :: Ys + #:else + real(wp), dimension(num_species) :: Ys + #:endif real(wp) :: mix_mol_weight do z = bounds(3)%beg, bounds(3)%end @@ -131,7 +139,7 @@ contains integer :: eqn real(wp) :: T real(wp) :: rho, omega_m - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if (not MFC_CASE_OPTIMIZATION and USING_AMD) or USING_CCE real(wp), dimension(10) :: Ys real(wp), dimension(10) :: omega #:else @@ -180,7 +188,7 @@ contains type(int_bounds_info), intent(in) :: irx, iry, irz integer, intent(in) :: idir - #:if not MFC_CASE_OPTIMIZATION and USING_AMD + #:if (not MFC_CASE_OPTIMIZATION and USING_AMD) or USING_CCE real(wp), dimension(10) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell real(wp), dimension(10) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2 real(wp), dimension(10) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp index 83b4801d3b..973945607d 100644 --- a/src/common/m_phase_change.fpp +++ b/src/common/m_phase_change.fpp @@ -104,6 +104,16 @@ contains !< Generic loop iterators integer :: i, j, k, l +#ifdef _CRAYFTN +#ifdef MFC_OpenACC + ! 
CCE 19 IPA workaround: prevent bring_routine_resident SIGSEGV + !DIR$ NOINLINE s_infinite_pt_relaxation_k + !DIR$ NOINLINE s_infinite_ptg_relaxation_k + !DIR$ NOINLINE s_correct_partial_densities + !DIR$ NOINLINE s_TSat +#endif +#endif + ! starting equilibrium solver $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok,pS, pSOV, pSSL, TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, dynE, rhos, rho, rM, m1, m2, MCT, TvF]') do j = 0, m @@ -296,7 +306,7 @@ contains !! @param TS equilibrium temperature at the interface subroutine s_infinite_pt_relaxation_k(j, k, l, MFL, pS, p_infpT, q_cons_vf, rhoe, TS) $:GPU_ROUTINE(function_name='s_infinite_pt_relaxation_k', & - & parallelism='[seq]', cray_inline=True) + & parallelism='[seq]', cray_noinline=True) ! initializing variables integer, intent(in) :: j, k, l, MFL @@ -411,7 +421,7 @@ contains !! @param TS equilibrium temperature at the interface subroutine s_infinite_ptg_relaxation_k(j, k, l, pS, p_infpT, rhoe, q_cons_vf, TS) $:GPU_ROUTINE(function_name='s_infinite_ptg_relaxation_k', & - & parallelism='[seq]', cray_inline=True) + & parallelism='[seq]', cray_noinline=True) integer, intent(in) :: j, k, l real(wp), intent(inout) :: pS @@ -579,7 +589,8 @@ contains InvJac = InvJac/(Jac(1, 1)*Jac(2, 2) - Jac(1, 2)*Jac(2, 1)) ! calculating correction array for Newton's method - DeltamP = -1.0_wp*(matmul(InvJac, R2D)) + DeltamP(1) = -1.0_wp*(InvJac(1, 1)*R2D(1) + InvJac(1, 2)*R2D(2)) + DeltamP(2) = -1.0_wp*(InvJac(2, 1)*R2D(1) + InvJac(2, 2)*R2D(2)) ! updating two reacting 'masses'. Recall that inert 'masses' do not change during the phase change ! liquid @@ -638,7 +649,7 @@ contains !! 
@param l generic loop iterator for z direction subroutine s_correct_partial_densities(MCT, q_cons_vf, rM, j, k, l) $:GPU_ROUTINE(function_name='s_correct_partial_densities', & - & parallelism='[seq]', cray_inline=True) + & parallelism='[seq]', cray_noinline=True) !> @name variables for the correction of the reacting partial densities !> @{ @@ -689,7 +700,7 @@ contains !! @param TSIn equilibrium Temperature elemental subroutine s_TSat(pSat, TSat, TSIn) $:GPU_ROUTINE(function_name='s_TSat',parallelism='[seq]', & - & cray_inline=True) + & cray_noinline=True) real(wp), intent(in) :: pSat real(wp), intent(out) :: TSat @@ -716,6 +727,12 @@ contains ! underrelaxation factor Om = 1.0e-3_wp + + ! FT must be initialized before the do while condition is evaluated. + ! Fortran .or. is not short-circuit: abs(FT) is always evaluated even + ! when ns == 0, so FT must have a defined value here. + FT = huge(1.0_wp) + do while ((abs(FT) > ptgalpha_eps) .or. (ns == 0)) ! increasing counter ns = ns + 1 diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp index 9c4e72258f..632c8df1b0 100644 --- a/src/common/m_variables_conversion.fpp +++ b/src/common/m_variables_conversion.fpp @@ -116,7 +116,7 @@ contains !! 
@param pres_mag Magnetic pressure (optional) subroutine s_compute_pressure(energy, alf, dyn_p, pi_inf, gamma, rho, qv, rhoYks, pres, T, stress, mom, G, pres_mag) $:GPU_ROUTINE(function_name='s_compute_pressure',parallelism='[seq]', & - & cray_inline=True) + & cray_noinline=True) real(stp), intent(in) :: energy, alf real(wp), intent(in) :: dyn_p @@ -326,7 +326,7 @@ contains alpha_K, alpha_rho_K, Re_K, & G_K, G) $:GPU_ROUTINE(function_name='s_convert_species_to_mixture_variables_acc', & - & parallelism='[seq]', cray_inline=True) + & parallelism='[seq]', cray_noinline=True) real(wp), intent(out) :: rho_K, gamma_K, pi_inf_K, qv_K #:if not MFC_CASE_OPTIMIZATION and USING_AMD @@ -1335,7 +1335,7 @@ contains !> This subroutine computes partial densities and volume fractions subroutine s_compute_species_fraction(q_vf, k, l, r, alpha_rho_K, alpha_K) $:GPU_ROUTINE(function_name='s_compute_species_fraction', & - & parallelism='[seq]', cray_inline=True) + & parallelism='[seq]', cray_noinline=True) type(scalar_field), dimension(sys_size), intent(in) :: q_vf integer, intent(in) :: k, l, r #:if not MFC_CASE_OPTIMIZATION and USING_AMD @@ -1480,7 +1480,7 @@ contains !> @brief Computes the fast magnetosonic wave speed from the sound speed, density, and magnetic field components. subroutine s_compute_fast_magnetosonic_speed(rho, c, B, norm, c_fast, h) $:GPU_ROUTINE(function_name='s_compute_fast_magnetosonic_speed', & - & parallelism='[seq]', cray_inline=True) + & parallelism='[seq]', cray_noinline=True) real(wp), intent(in) :: B(3), rho, c real(wp), intent(in) :: h ! 
only used for relativity diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp index 53ad76f24b..4ae590a4b8 100644 --- a/src/simulation/m_bubbles_EL.fpp +++ b/src/simulation/m_bubbles_EL.fpp @@ -1532,8 +1532,7 @@ contains integer(KIND=MPI_OFFSET_KIND) :: disp integer :: view integer, dimension(2) :: gsizes, lsizes, start_idx_part - integer, dimension(num_procs) :: part_order, part_ord_mpi - integer, dimension(num_procs) :: proc_bubble_counts + integer, allocatable :: proc_bubble_counts(:) real(wp), dimension(1:1, 1:lag_io_vars) :: dummy dummy = 0._wp @@ -1548,6 +1547,8 @@ contains if (.not. parallel_io) return + allocate (proc_bubble_counts(num_procs)) + lsizes(1) = bub_id lsizes(2) = lag_io_vars @@ -1659,6 +1660,8 @@ contains call MPI_FILE_CLOSE(ifile, ierr) end if + deallocate (proc_bubble_counts) + #endif end subroutine s_write_restart_lag_bubbles diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py index 79f32945b9..dc123cd9e3 100644 --- a/toolchain/mfc/run/input.py +++ b/toolchain/mfc/run/input.py @@ -90,13 +90,25 @@ def generate_fpp(self, target) -> None: directive_str = None # Write the generated Fortran code to the m_thermochem.f90 file with the chosen precision + thermochem_code = pyro.FortranCodeGenerator().generate( + "m_thermochem", + self.get_cantera_solution(), + pyro.CodeGenerationOptions(scalar_type = real_type, directive_offload = directive_str) + ) + + # CCE 19.0.0 workaround: pyrometheus generates !DIR$ INLINEALWAYS for Cray+ACC + # but omits !$acc routine seq, so thermochem routines are not registered as + # OpenACC device routines. Replace with plain !$acc routine seq (no INLINEALWAYS). 
+ if directive_str == 'acc': + thermochem_code = thermochem_code.replace( + "#ifdef _CRAYFTN\n#define GPU_ROUTINE(name) !DIR$ INLINEALWAYS name\n" + "#else\n#define GPU_ROUTINE(name) !$acc routine seq\n#endif", + "#define GPU_ROUTINE(name) !$acc routine seq" + ) + common.file_write( os.path.join(modules_dir, "m_thermochem.f90"), - pyro.FortranCodeGenerator().generate( - "m_thermochem", - self.get_cantera_solution(), - pyro.CodeGenerationOptions(scalar_type = real_type, directive_offload = directive_str) - ), + thermochem_code, True ) From 1aa4cf58ff2a0a5f17bb101bc847a40a70625ab2 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 5 Mar 2026 02:49:23 -0500 Subject: [PATCH 02/27] Temporarily disable Phoenix + Frontier AMD CI (pre-existing failures unrelated to CCE fix) --- .github/workflows/test.yml | 53 +++----------------------------------- 1 file changed, 4 insertions(+), 49 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fe549ac10f..b83f0ce78e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -170,22 +170,7 @@ jobs: strategy: matrix: include: - # Phoenix (GT) — build+test combined in SLURM job - - runner: 'gt' - cluster: 'phoenix' - cluster_name: 'Georgia Tech | Phoenix' - device: 'gpu' - interface: 'acc' - - runner: 'gt' - cluster: 'phoenix' - cluster_name: 'Georgia Tech | Phoenix' - device: 'gpu' - interface: 'omp' - - runner: 'gt' - cluster: 'phoenix' - cluster_name: 'Georgia Tech | Phoenix' - device: 'cpu' - interface: 'none' + # Phoenix (GT) — TEMPORARILY DISABLED (pre-existing SLURM/Case Opt failures) # Frontier (ORNL) — build on login node, GPU tests sharded for batch partition - runner: 'frontier' cluster: 'frontier' @@ -216,24 +201,7 @@ jobs: cluster_name: 'Oak Ridge | Frontier' device: 'cpu' interface: 'none' - # Frontier AMD — build on login node, GPU tests sharded for batch partition - - runner: 'frontier' - cluster: 'frontier_amd' - cluster_name: 'Oak Ridge | Frontier (AMD)' - 
device: 'gpu' - interface: 'omp' - shard: '1/2' - - runner: 'frontier' - cluster: 'frontier_amd' - cluster_name: 'Oak Ridge | Frontier (AMD)' - device: 'gpu' - interface: 'omp' - shard: '2/2' - - runner: 'frontier' - cluster: 'frontier_amd' - cluster_name: 'Oak Ridge | Frontier (AMD)' - device: 'cpu' - interface: 'none' + # Frontier AMD — TEMPORARILY DISABLED (pre-existing failures unrelated to CCE fix) runs-on: group: phoenix labels: ${{ matrix.runner }} @@ -289,16 +257,8 @@ jobs: strategy: matrix: include: - - runner: 'gt' - cluster: 'phoenix' - cluster_name: 'Georgia Tech | Phoenix' - device: 'gpu' - interface: 'acc' - - runner: 'gt' - cluster: 'phoenix' - cluster_name: 'Georgia Tech | Phoenix' - device: 'gpu' - interface: 'omp' + # Phoenix (GT) — TEMPORARILY DISABLED (pre-existing SLURM/Case Opt failures) + # Frontier AMD — TEMPORARILY DISABLED (pre-existing failures unrelated to CCE fix) - runner: 'frontier' cluster: 'frontier' cluster_name: 'Oak Ridge | Frontier' @@ -309,11 +269,6 @@ jobs: cluster_name: 'Oak Ridge | Frontier' device: 'gpu' interface: 'omp' - - runner: 'frontier' - cluster: 'frontier_amd' - cluster_name: 'Oak Ridge | Frontier (AMD)' - device: 'gpu' - interface: 'omp' runs-on: group: phoenix labels: ${{ matrix.runner }} From 835a2b8b1a1086e23a94d739448046ee656f4a61 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 5 Mar 2026 02:59:51 -0500 Subject: [PATCH 03/27] Address code review findings: bounds warning, assert patch applied, CMake COMPILE_OPTIONS - input.py: warn if cantera mechanism has >10 species (CCE dimension(10) limit) - input.py: assert that Cray+ACC GPU_ROUTINE macro patch was actually applied - CMakeLists.txt: COMPILE_FLAGS -> COMPILE_OPTIONS (deprecated property) --- CMakeLists.txt | 2 +- toolchain/mfc/run/input.py | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b90ed03322..d718e83685 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ 
-644,7 +644,7 @@ if (MFC_SIMULATION) set_source_files_properties( "${CMAKE_BINARY_DIR}/fypp/simulation/m_bubbles_EL.fpp.f90" "${CMAKE_BINARY_DIR}/fypp/simulation/m_phase_change.fpp.f90" - PROPERTIES COMPILE_FLAGS "-Oipa0" + PROPERTIES COMPILE_OPTIONS "-Oipa0" ) endif() endif() diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py index dc123cd9e3..38129a8995 100644 --- a/toolchain/mfc/run/input.py +++ b/toolchain/mfc/run/input.py @@ -90,9 +90,18 @@ def generate_fpp(self, target) -> None: directive_str = None # Write the generated Fortran code to the m_thermochem.f90 file with the chosen precision + sol = self.get_cantera_solution() + + # CCE 19.0.0 workaround: m_chemistry.fpp uses dimension(10) for local species arrays + # on Cray builds to avoid an InstCombine ICE. Warn if the mechanism exceeds this limit. + if sol.n_species > 10: + cons.print(f"[bold yellow]Warning:[/bold yellow] cantera mechanism has {sol.n_species} species > 10. " + "Cray Fortran (CCE) builds use a hardcoded dimension(10) workaround in " + "m_chemistry.fpp and will overflow on CCE. See PR #1286.") + thermochem_code = pyro.FortranCodeGenerator().generate( "m_thermochem", - self.get_cantera_solution(), + sol, pyro.CodeGenerationOptions(scalar_type = real_type, directive_offload = directive_str) ) @@ -100,11 +109,19 @@ def generate_fpp(self, target) -> None: # but omits !$acc routine seq, so thermochem routines are not registered as # OpenACC device routines. Replace with plain !$acc routine seq (no INLINEALWAYS). 
if directive_str == 'acc': - thermochem_code = thermochem_code.replace( + old_macro = ( "#ifdef _CRAYFTN\n#define GPU_ROUTINE(name) !DIR$ INLINEALWAYS name\n" - "#else\n#define GPU_ROUTINE(name) !$acc routine seq\n#endif", - "#define GPU_ROUTINE(name) !$acc routine seq" + "#else\n#define GPU_ROUTINE(name) !$acc routine seq\n#endif" ) + new_macro = "#define GPU_ROUTINE(name) !$acc routine seq" + patched = thermochem_code.replace(old_macro, new_macro) + if patched == thermochem_code: + raise common.MFCException( + "CCE 19.0.0 workaround: pyrometheus output format changed — " + "Cray+ACC GPU_ROUTINE macro patch did not apply. " + "Update the pattern in toolchain/mfc/run/input.py." + ) + thermochem_code = patched common.file_write( os.path.join(modules_dir, "m_thermochem.f90"), From ddcaa4a3aef62f982b0f11c9661d7c0cefc673ce Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 5 Mar 2026 03:13:22 -0500 Subject: [PATCH 04/27] Temporarily disable Phoenix (NVHPC) benchmark jobs (QOS job limit issues) --- .github/workflows/bench.yml | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b45fc45e40..b9cad5e07f 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -37,30 +37,7 @@ jobs: fail-fast: false matrix: include: - - cluster: phoenix - name: Georgia Tech | Phoenix (NVHPC) - group: phoenix - labels: gt - flag: p - device: cpu - interface: none - build_script: "" - - cluster: phoenix - name: Georgia Tech | Phoenix (NVHPC) - group: phoenix - labels: gt - flag: p - device: gpu - interface: acc - build_script: "" - - cluster: phoenix - name: Georgia Tech | Phoenix (NVHPC) - group: phoenix - labels: gt - flag: p - device: gpu - interface: omp - build_script: "" + # Phoenix (NVHPC) — TEMPORARILY DISABLED (QOS job limit / pre-existing SLURM failures) - cluster: frontier name: Oak Ridge | Frontier (CCE) group: phoenix From 
631482010830a3a8c576be89216e38e02dcac7a3 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 5 Mar 2026 05:19:42 -0500 Subject: [PATCH 05/27] Address CodeRabbit review: CCE PROHIBIT guards + pyrometheus forward-compat Add @:PROHIBIT(num_species > 10) in all four USING_CCE blocks in m_chemistry.fpp so CCE builds with >10 species fail with a clear message rather than silently overflowing the fixed-size dimension(10) arrays (matching the existing AMD guard in m_checker_common.fpp). Make pyrometheus GPU_ROUTINE macro patch forward-compatible: if a future pyrometheus version already emits the correct form directly, skip the patch rather than raising an exception. Co-Authored-By: Claude Sonnet 4.6 --- src/common/m_chemistry.fpp | 16 ++++++++++++++++ toolchain/mfc/run/input.py | 13 ++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp index 432b1eb02e..ec046a62d6 100644 --- a/src/common/m_chemistry.fpp +++ b/src/common/m_chemistry.fpp @@ -69,6 +69,10 @@ contains real(wp), dimension(num_species) :: Ys #:endif + #:if USING_CCE + @:PROHIBIT(num_species > 10, "CCE 19.0.0 workaround: num_species must be <= 10 (fixed-size arrays in m_chemistry.fpp)") + #:endif + do z = bounds(3)%beg, bounds(3)%end do y = bounds(2)%beg, bounds(2)%end do x = bounds(1)%beg, bounds(1)%end @@ -112,6 +116,10 @@ contains #:endif real(wp) :: mix_mol_weight + #:if USING_CCE + @:PROHIBIT(num_species > 10, "CCE 19.0.0 workaround: num_species must be <= 10 (fixed-size arrays in m_chemistry.fpp)") + #:endif + do z = bounds(3)%beg, bounds(3)%end do y = bounds(2)%beg, bounds(2)%end do x = bounds(1)%beg, bounds(1)%end @@ -147,6 +155,10 @@ contains real(wp), dimension(num_species) :: omega #:endif + #:if USING_CCE + @:PROHIBIT(num_species > 10, "CCE 19.0.0 workaround: num_species must be <= 10 (fixed-size arrays in m_chemistry.fpp)") + #:endif + $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega, eqn, T, rho, omega_m]', 
copyin='[bounds]') do z = bounds(3)%beg, bounds(3)%end do y = bounds(2)%beg, bounds(2)%end @@ -210,6 +222,10 @@ contains integer :: x, y, z, i, n, eqn integer, dimension(3) :: offsets + #:if USING_CCE + @:PROHIBIT(num_species > 10, "CCE 19.0.0 workaround: num_species must be <= 10 (fixed-size arrays in m_chemistry.fpp)") + #:endif + isc1 = irx; isc2 = iry; isc3 = irz $:GPU_UPDATE(device='[isc1,isc2,isc3]') diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py index 38129a8995..b426ef7a27 100644 --- a/toolchain/mfc/run/input.py +++ b/toolchain/mfc/run/input.py @@ -116,11 +116,14 @@ def generate_fpp(self, target) -> None: new_macro = "#define GPU_ROUTINE(name) !$acc routine seq" patched = thermochem_code.replace(old_macro, new_macro) if patched == thermochem_code: - raise common.MFCException( - "CCE 19.0.0 workaround: pyrometheus output format changed — " - "Cray+ACC GPU_ROUTINE macro patch did not apply. " - "Update the pattern in toolchain/mfc/run/input.py." - ) + if new_macro in thermochem_code: + pass # pyrometheus already emits the correct form; no patch needed + else: + raise common.MFCException( + "CCE 19.0.0 workaround: pyrometheus output format changed — " + "Cray+ACC GPU_ROUTINE macro patch did not apply. " + "Update the pattern in toolchain/mfc/run/input.py." 
+ ) thermochem_code = patched common.file_write( From c274109b06c14d9f88330b7e675f1479551d6a39 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 5 Mar 2026 11:51:46 -0500 Subject: [PATCH 06/27] Add comment noting pyrometheus upstream issue for thermochem GPU_ROUTINE patch Co-Authored-By: Claude Sonnet 4.6 --- toolchain/mfc/run/input.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py index b426ef7a27..cfd77cf26b 100644 --- a/toolchain/mfc/run/input.py +++ b/toolchain/mfc/run/input.py @@ -108,6 +108,8 @@ def generate_fpp(self, target) -> None: # CCE 19.0.0 workaround: pyrometheus generates !DIR$ INLINEALWAYS for Cray+ACC # but omits !$acc routine seq, so thermochem routines are not registered as # OpenACC device routines. Replace with plain !$acc routine seq (no INLINEALWAYS). + # This patch can be removed once pyrometheus upstream correctly emits !$acc routine seq + # for Cray+OpenACC (the broken macro originates in pyrometheus's code generator). if directive_str == 'acc': old_macro = ( "#ifdef _CRAYFTN\n#define GPU_ROUTINE(name) !DIR$ INLINEALWAYS name\n" From 1dadcc39f5fdf597237ae0042a6c0b0164c7a82c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 02:08:43 -0500 Subject: [PATCH 07/27] Fix Frontier benchmark SLURM: use batch+1:59+normal QOS Benchmark jobs were using the extended partition (5:59 walltime, ENG160 account) causing multi-hour queue waits and hitting GHA's 8h wall-clock limit. The actual benchmark runs in ~20 minutes on the node. Switch to batch + 1:59 + --qos=normal (same as the test suite jobs). 
Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/bench.yml | 2 +- .github/workflows/frontier/submit.sh | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b9cad5e07f..76acfa80ff 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -65,7 +65,7 @@ jobs: runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} - timeout-minutes: 480 + timeout-minutes: 240 steps: - name: Clone - PR uses: actions/checkout@v4 diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index 16d4f0d73c..8b914db03e 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -45,10 +45,10 @@ fi # Select SBATCH params based on job type if [ "$job_type" = "bench" ]; then - sbatch_account="#SBATCH -A ENG160" - sbatch_time="#SBATCH -t 05:59:00" - sbatch_partition="#SBATCH -p extended" - sbatch_extra="" + sbatch_account="#SBATCH -A CFD154" + sbatch_time="#SBATCH -t 01:59:00" + sbatch_partition="#SBATCH -p batch" + sbatch_extra="#SBATCH --qos=normal" else sbatch_account="#SBATCH -A CFD154" sbatch_time="#SBATCH -t 01:59:00" From e20827544644ef544f397559da2b9869e537d658 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 02:13:47 -0500 Subject: [PATCH 08/27] Fix bench.yml: restore timeout-minutes to 480 (revert accidental 240) Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 76acfa80ff..b9cad5e07f 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -65,7 +65,7 @@ jobs: runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} - timeout-minutes: 240 + timeout-minutes: 480 steps: - name: Clone - PR uses: actions/checkout@v4 From 2d1b359f9b314a159e1b57284ffe058c4b78d494 Mon Sep 17 00:00:00 2001 From: Spencer 
Bryngelson Date: Fri, 6 Mar 2026 04:17:16 -0500 Subject: [PATCH 09/27] Address review: CCE_MAX_SPECIES constant, GPU error for n_species overflow, noinline comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - m_chemistry.fpp: introduce CCE_MAX_SPECIES Fypp constant (= 10) as single source of truth; replace all 8 hardcoded dimension(10) and 4 PROHIBIT(> 10) occurrences with ${CCE_MAX_SPECIES}$ - input.py: elevate n_species > CCE_MAX_SPECIES from warning to MFCException for GPU builds (directive_str is not None); CPU builds still warn - parallel_macros.fpp: add comment explaining cray_noinline emits nothing on non-Cray CPU builds (intentional — !DIR$ NOINLINE is Cray-specific) Co-Authored-By: Claude Sonnet 4.6 --- src/common/include/parallel_macros.fpp | 2 ++ src/common/m_chemistry.fpp | 28 +++++++++++++++----------- toolchain/mfc/run/input.py | 19 +++++++++++------ 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp index e1cd3ff30b..dc7bab1705 100644 --- a/src/common/include/parallel_macros.fpp +++ b/src/common/include/parallel_macros.fpp @@ -68,6 +68,8 @@ #else $:cray_noinline_directive #endif +## On non-Cray CPU builds (no _CRAYFTN, no MFC_OpenACC, no MFC_OpenMP), nothing is +## emitted — intentional, since !DIR$ NOINLINE is a Cray-specific directive. #elif MFC_OpenACC $:acc_directive #elif MFC_OpenMP diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp index ec046a62d6..80f6c4bbb8 100644 --- a/src/common/m_chemistry.fpp +++ b/src/common/m_chemistry.fpp @@ -6,6 +6,10 @@ #:include 'macros.fpp' #:include 'case.fpp' +## CCE 19.0.0 workaround: fixed-size array limit for local species arrays under _CRAYFTN. +## Must match the Python-side check in toolchain/mfc/run/input.py. See PR #1286. 
+#:set CCE_MAX_SPECIES = 10 + !> @brief Multi-species chemistry interface for thermodynamic properties, reaction rates, and transport coefficients module m_chemistry @@ -64,13 +68,13 @@ contains integer :: x, y, z, eqn real(wp) :: energy, T_in #:if USING_CCE - real(wp), dimension(10) :: Ys + real(wp), dimension(${CCE_MAX_SPECIES}$) :: Ys #:else real(wp), dimension(num_species) :: Ys #:endif #:if USING_CCE - @:PROHIBIT(num_species > 10, "CCE 19.0.0 workaround: num_species must be <= 10 (fixed-size arrays in m_chemistry.fpp)") + @:PROHIBIT(num_species > ${CCE_MAX_SPECIES}$, "CCE 19.0.0 workaround: num_species must be <= ${CCE_MAX_SPECIES}$ (fixed-size arrays in m_chemistry.fpp)") #:endif do z = bounds(3)%beg, bounds(3)%end @@ -110,14 +114,14 @@ contains integer :: x, y, z, i #:if USING_CCE - real(wp), dimension(10) :: Ys + real(wp), dimension(${CCE_MAX_SPECIES}$) :: Ys #:else real(wp), dimension(num_species) :: Ys #:endif real(wp) :: mix_mol_weight #:if USING_CCE - @:PROHIBIT(num_species > 10, "CCE 19.0.0 workaround: num_species must be <= 10 (fixed-size arrays in m_chemistry.fpp)") + @:PROHIBIT(num_species > ${CCE_MAX_SPECIES}$, "CCE 19.0.0 workaround: num_species must be <= ${CCE_MAX_SPECIES}$ (fixed-size arrays in m_chemistry.fpp)") #:endif do z = bounds(3)%beg, bounds(3)%end @@ -148,15 +152,15 @@ contains real(wp) :: T real(wp) :: rho, omega_m #:if (not MFC_CASE_OPTIMIZATION and USING_AMD) or USING_CCE - real(wp), dimension(10) :: Ys - real(wp), dimension(10) :: omega + real(wp), dimension(${CCE_MAX_SPECIES}$) :: Ys + real(wp), dimension(${CCE_MAX_SPECIES}$) :: omega #:else real(wp), dimension(num_species) :: Ys real(wp), dimension(num_species) :: omega #:endif #:if USING_CCE - @:PROHIBIT(num_species > 10, "CCE 19.0.0 workaround: num_species must be <= 10 (fixed-size arrays in m_chemistry.fpp)") + @:PROHIBIT(num_species > ${CCE_MAX_SPECIES}$, "CCE 19.0.0 workaround: num_species must be <= ${CCE_MAX_SPECIES}$ (fixed-size arrays in m_chemistry.fpp)") #:endif 
$:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega, eqn, T, rho, omega_m]', copyin='[bounds]') @@ -201,10 +205,10 @@ contains integer, intent(in) :: idir #:if (not MFC_CASE_OPTIMIZATION and USING_AMD) or USING_CCE - real(wp), dimension(10) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell - real(wp), dimension(10) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2 - real(wp), dimension(10) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k - real(wp), dimension(10) :: Mass_Diffu_Flux, dYk_dxi + real(wp), dimension(${CCE_MAX_SPECIES}$) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell + real(wp), dimension(${CCE_MAX_SPECIES}$) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2 + real(wp), dimension(${CCE_MAX_SPECIES}$) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k + real(wp), dimension(${CCE_MAX_SPECIES}$) :: Mass_Diffu_Flux, dYk_dxi #:else real(wp), dimension(num_species) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell real(wp), dimension(num_species) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2 @@ -223,7 +227,7 @@ contains integer, dimension(3) :: offsets #:if USING_CCE - @:PROHIBIT(num_species > 10, "CCE 19.0.0 workaround: num_species must be <= 10 (fixed-size arrays in m_chemistry.fpp)") + @:PROHIBIT(num_species > ${CCE_MAX_SPECIES}$, "CCE 19.0.0 workaround: num_species must be <= ${CCE_MAX_SPECIES}$ (fixed-size arrays in m_chemistry.fpp)") #:endif isc1 = irx; isc2 = iry; isc3 = irz diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py index cfd77cf26b..bdd476365c 100644 --- a/toolchain/mfc/run/input.py +++ b/toolchain/mfc/run/input.py @@ -92,12 +92,19 @@ def generate_fpp(self, target) -> None: # Write the generated Fortran code to the m_thermochem.f90 file with the chosen precision sol = self.get_cantera_solution() - # CCE 19.0.0 workaround: m_chemistry.fpp uses dimension(10) for local species arrays - # on Cray builds to avoid an InstCombine ICE. Warn if the mechanism exceeds this limit. 
- if sol.n_species > 10: - cons.print(f"[bold yellow]Warning:[/bold yellow] cantera mechanism has {sol.n_species} species > 10. " - "Cray Fortran (CCE) builds use a hardcoded dimension(10) workaround in " - "m_chemistry.fpp and will overflow on CCE. See PR #1286.") + # CCE 19.0.0 workaround: m_chemistry.fpp uses dimension(CCE_MAX_SPECIES) for local + # species arrays on Cray builds to avoid an InstCombine ICE. Must match the Fypp + # constant CCE_MAX_SPECIES in src/common/m_chemistry.fpp. + CCE_MAX_SPECIES = 10 + if sol.n_species > CCE_MAX_SPECIES: + msg = (f"Cantera mechanism has {sol.n_species} species > {CCE_MAX_SPECIES}. " + f"Cray Fortran (CCE) builds use a hardcoded dimension({CCE_MAX_SPECIES}) " + "workaround in m_chemistry.fpp and will abort at runtime on CCE. See PR #1286.") + if directive_str is not None: + # GPU builds: hard error — the Fortran PROHIBIT will abort anyway, + # so fail early at input generation rather than at the first chemistry call. + raise common.MFCException(msg) + cons.print(f"[bold yellow]Warning:[/bold yellow] {msg}") thermochem_code = pyro.FortranCodeGenerator().generate( "m_thermochem", From 810056d473437e3dea313069b582d0791371491d Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 07:42:18 -0500 Subject: [PATCH 10/27] Fix ##-> #! Fypp comment in m_chemistry.fpp top-level scope ## is only valid inside Fypp blocks (#:def, #:if). At file top-level it passes through to the .f90 output, causing gfortran CPP to error with 'invalid preprocessing directive ##'. Switch to #! which Fypp always strips. 
Co-Authored-By: Claude Sonnet 4.6 --- src/common/m_chemistry.fpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp index 80f6c4bbb8..15228fddc7 100644 --- a/src/common/m_chemistry.fpp +++ b/src/common/m_chemistry.fpp @@ -6,8 +6,8 @@ #:include 'macros.fpp' #:include 'case.fpp' -## CCE 19.0.0 workaround: fixed-size array limit for local species arrays under _CRAYFTN. -## Must match the Python-side check in toolchain/mfc/run/input.py. See PR #1286. +#! CCE 19.0.0 workaround: fixed-size array limit for local species arrays under _CRAYFTN. +#! Must match the Python-side check in toolchain/mfc/run/input.py. See PR #1286. #:set CCE_MAX_SPECIES = 10 !> @brief Multi-species chemistry interface for thermodynamic properties, reaction rates, and transport coefficients From 8a6398c431be7f6ba047ba604447d74341986d76 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 11:08:58 -0500 Subject: [PATCH 11/27] Fix ##-> #! Fypp comment in parallel_macros.fpp cray_noinline block Same root cause as m_chemistry.fpp fix: ## is not a Fypp comment and passes through to the generated .f90 output. Inside #ifdef _CRAYFTN, gfortran never sees the ## lines (since _CRAYFTN is undefined there), but CCE does and errors with 'Unknown or unsupported compiler directive'. Change to #! which Fypp always strips. Co-Authored-By: Claude Sonnet 4.6 --- src/common/include/parallel_macros.fpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp index dc7bab1705..a13bcbdfcb 100644 --- a/src/common/include/parallel_macros.fpp +++ b/src/common/include/parallel_macros.fpp @@ -68,8 +68,8 @@ #else $:cray_noinline_directive #endif -## On non-Cray CPU builds (no _CRAYFTN, no MFC_OpenACC, no MFC_OpenMP), nothing is -## emitted — intentional, since !DIR$ NOINLINE is a Cray-specific directive. + #! 
On non-Cray CPU builds (no _CRAYFTN, no MFC_OpenACC, no MFC_OpenMP), nothing is + #! emitted — intentional, since !DIR$ NOINLINE is a Cray-specific directive. #elif MFC_OpenACC $:acc_directive #elif MFC_OpenMP From 23309f627fb8df2c141186b9b2c2f4a0bd9a2e8d Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 11:18:41 -0500 Subject: [PATCH 12/27] Extend -Oipa0 workaround to all Cray builds, not just Cray+OpenACC m_phase_change triggers a bring_routine_resident SIGSEGV (ftn-2116 INTERNAL) on CCE 19.0.0 CPU-only builds too, not just OpenACC GPU builds. Widen the CMakeLists guard from 'Cray AND MFC_OpenACC' to 'Cray' to fix the CCE CPU simulation build. See master CI run 22627725058 for the failure evidence. Co-Authored-By: Claude Sonnet 4.6 --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d718e83685..609b378773 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -635,12 +635,12 @@ if (MFC_SIMULATION) SOURCES "${simulation_SRCs}" MPI FFTW OpenACC OpenMP) # CCE 19.0.0 IPA workaround: two files trigger IPA crashes: - # m_bubbles_EL: castIsValid assertion (InstCombine/foldIntegerTypedPHI) - # m_phase_change: bring_routine_resident SIGSEGV + # m_bubbles_EL: castIsValid assertion (InstCombine/foldIntegerTypedPHI) — GPU builds + # m_phase_change: bring_routine_resident SIGSEGV — all Cray builds (incl. CPU-only) # Disabling IPA per-file avoids the crashes while preserving IPA for # the rest of simulation (needed for thermochem INLINEALWAYS inlining). # See PR #1286. 
- if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray" AND MFC_OpenACC) + if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") set_source_files_properties( "${CMAKE_BINARY_DIR}/fypp/simulation/m_bubbles_EL.fpp.f90" "${CMAKE_BINARY_DIR}/fypp/simulation/m_phase_change.fpp.f90" From 5d177b797ac28b2d2cc661dc05b2aba8539c234b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 12:47:33 -0500 Subject: [PATCH 13/27] Fix -Oipa0 guard: exclude Cray+OpenMP, cover Cray+OpenACC and Cray CPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Cray+OpenMP, m_thermochem uses !DIR$ INLINEALWAYS (IPA inlining) so disabling IPA for m_phase_change/m_bubbles_EL breaks thermochem on-device calls → Phase Change and Lagrange Bubble tests crash at runtime (gpu-omp). On Cray+OpenACC, the pyrometheus patch emits !$acc routine seq instead, so IPA is not needed for thermochem. On Cray CPU, GPU tests are skipped. Condition: Cray AND NOT MFC_OpenMP (covers OpenACC + CPU, excludes OpenMP). Co-Authored-By: Claude Sonnet 4.6 --- CMakeLists.txt | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 609b378773..a3aa9a1444 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -635,12 +635,16 @@ if (MFC_SIMULATION) SOURCES "${simulation_SRCs}" MPI FFTW OpenACC OpenMP) # CCE 19.0.0 IPA workaround: two files trigger IPA crashes: - # m_bubbles_EL: castIsValid assertion (InstCombine/foldIntegerTypedPHI) — GPU builds - # m_phase_change: bring_routine_resident SIGSEGV — all Cray builds (incl. CPU-only) + # m_bubbles_EL: castIsValid assertion (InstCombine/foldIntegerTypedPHI) + # m_phase_change: bring_routine_resident SIGSEGV # Disabling IPA per-file avoids the crashes while preserving IPA for # the rest of simulation (needed for thermochem INLINEALWAYS inlining). 
+ # Applied to Cray+OpenACC and Cray CPU, but NOT Cray+OpenMP: on OpenMP, + # m_thermochem uses !DIR$ INLINEALWAYS (requires IPA), so disabling IPA + # for these files breaks thermochem on-device calls. On OpenACC the + # pyrometheus patch emits !$acc routine seq instead (no IPA needed). # See PR #1286. - if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") + if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray" AND NOT MFC_OpenMP) set_source_files_properties( "${CMAKE_BINARY_DIR}/fypp/simulation/m_bubbles_EL.fpp.f90" "${CMAKE_BINARY_DIR}/fypp/simulation/m_phase_change.fpp.f90" From 9fc072a02455584328540f516ffab7f5e410045b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 11:39:45 -0500 Subject: [PATCH 14/27] Remove persistent build cache for self-hosted test runners Replace setup-build-cache.sh symlink mechanism with rm -rf build before each test run on Phoenix and Frontier. Benchmark jobs unaffected. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/frontier/build.sh | 3 +-- .github/workflows/phoenix/test.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 88446ad2a0..6abb0cff8a 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -20,9 +20,8 @@ build_opts="$gpu_opts" . 
./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") -# Only set up build cache for test suite, not benchmarks if [ "$run_bench" != "bench" ]; then - source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface" + rm -rf build fi source .github/scripts/retry-build.sh diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh index 6816bd9a25..c8a5af2132 100644 --- a/.github/workflows/phoenix/test.sh +++ b/.github/workflows/phoenix/test.sh @@ -3,8 +3,7 @@ source .github/scripts/gpu-opts.sh build_opts="$gpu_opts" -# Set up persistent build cache -source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface" +rm -rf build # Build with retry; smoke-test cached binaries to catch architecture mismatches # (SIGILL from binaries compiled on a different compute node). From 2cdade93b2534264bff626c833c714f609e16f44 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 11:50:47 -0500 Subject: [PATCH 15/27] Remove build cache from benchmark jobs on Phoenix and Frontier --- .github/workflows/frontier/build.sh | 4 +--- .github/workflows/phoenix/bench.sh | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 6abb0cff8a..d21b1ddac4 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -20,9 +20,7 @@ build_opts="$gpu_opts" . 
./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") -if [ "$run_bench" != "bench" ]; then - rm -rf build -fi +rm -rf build source .github/scripts/retry-build.sh if [ "$run_bench" == "bench" ]; then diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh index 0eafc485d1..e91ece366b 100644 --- a/.github/workflows/phoenix/bench.sh +++ b/.github/workflows/phoenix/bench.sh @@ -15,6 +15,8 @@ else bench_opts="--mem 1" fi +rm -rf build + source .github/scripts/retry-build.sh RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1 From 6e97695e029b73acaf3f0d4bfdac420507d133c3 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 14:10:18 -0500 Subject: [PATCH 16/27] Fix submit.sh to survive monitor SIGKILL by re-checking SLURM state When the runner process is killed (exit 137) before the SLURM job completes, sacct is used to verify the job's final state. If the SLURM job completed with exit 0:0, the CI step passes regardless of the monitor's exit code. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/phoenix/submit.sh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 5b7162fef7..c370ec5a3f 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -96,4 +96,20 @@ echo "Submitted batch job $job_id" # Use resilient monitoring instead of sbatch -W SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" +monitor_exit=0 +bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$? + +if [ "$monitor_exit" -ne 0 ]; then + echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..." 
+ # Give the SLURM epilog time to finalize if the job just finished + sleep 30 + final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") + final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") + echo "Final SLURM state=$final_state exit=$final_exit" + if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then + echo "SLURM job $job_id completed successfully despite monitor failure — continuing." + else + echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" + exit 1 + fi +fi From 61924d8a67a59ab918de259e72abb039a8b442eb Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 14:28:40 -0500 Subject: [PATCH 17/27] Extract monitor SIGKILL recovery into shared run_monitored_slurm_job.sh All three submit.sh scripts (phoenix, frontier, frontier_amd symlink) now call a single helper that wraps monitor_slurm_job.sh with sacct fallback: if the monitor is killed before the SLURM job completes, the helper re-checks the job's final state and exits 0 if it succeeded. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/run_monitored_slurm_job.sh | 37 ++++++++++++++++++++++ .github/workflows/frontier/submit.sh | 3 +- .github/workflows/phoenix/submit.sh | 19 +---------- 3 files changed, 39 insertions(+), 20 deletions(-) create mode 100644 .github/scripts/run_monitored_slurm_job.sh diff --git a/.github/scripts/run_monitored_slurm_job.sh b/.github/scripts/run_monitored_slurm_job.sh new file mode 100644 index 0000000000..905520c45e --- /dev/null +++ b/.github/scripts/run_monitored_slurm_job.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Run monitor_slurm_job.sh and recover if the monitor is killed (e.g. SIGKILL +# from the runner OS) before the SLURM job completes. 
When the monitor exits +# non-zero, sacct is used to verify the job's actual final state; if the SLURM +# job succeeded we exit 0 so the CI step is not falsely marked as failed. +# +# Usage: run_monitored_slurm_job.sh JOB_ID OUTPUT_FILE + +set -euo pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +job_id="$1" +output_file="$2" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +monitor_exit=0 +bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$? + +if [ "$monitor_exit" -ne 0 ]; then + echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..." + # Give the SLURM epilog time to finalize if the job just finished + sleep 30 + final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") + final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") + echo "Final SLURM state=$final_state exit=$final_exit" + if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then + echo "SLURM job $job_id completed successfully despite monitor failure — continuing." 
+ else + echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" + exit 1 + fi +fi diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index 8b914db03e..4b472cd433 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -102,5 +102,4 @@ fi echo "Submitted batch job $job_id" -# Use resilient monitoring instead of sbatch -W -bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" +bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file" diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index c370ec5a3f..786489d1c4 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -94,22 +94,5 @@ fi echo "Submitted batch job $job_id" -# Use resilient monitoring instead of sbatch -W SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -monitor_exit=0 -bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$? - -if [ "$monitor_exit" -ne 0 ]; then - echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..." - # Give the SLURM epilog time to finalize if the job just finished - sleep 30 - final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") - final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") - echo "Final SLURM state=$final_state exit=$final_exit" - if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then - echo "SLURM job $job_id completed successfully despite monitor failure — continuing." 
- else - echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" - exit 1 - fi -fi +bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file" From ac28127cf51cf117bd853abc09280fb9d6875174 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 7 Mar 2026 03:47:46 -0500 Subject: [PATCH 18/27] bench: update Phoenix tmpbuild path to project storage --- .github/workflows/phoenix/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh index e91ece366b..10a38d0eea 100644 --- a/.github/workflows/phoenix/bench.sh +++ b/.github/workflows/phoenix/bench.sh @@ -2,7 +2,7 @@ source .github/scripts/bench-preamble.sh -tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build +tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build currentdir=$tmpbuild/run-$(( RANDOM % 900 )) mkdir -p $tmpbuild mkdir -p $currentdir From 8db880774a29a16a57b9a20be0c6c70685d0e965 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 7 Mar 2026 18:18:27 -0500 Subject: [PATCH 19/27] Re-enable Phoenix NVHPC and Frontier AMD in CI workflows Restore all runners that were temporarily disabled to speed up initial CI validation of the CCE fix. 
Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/bench.yml | 25 ++++++++++++++++- .github/workflows/test.yml | 53 ++++++++++++++++++++++++++++++++++--- 2 files changed, 73 insertions(+), 5 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b9cad5e07f..b45fc45e40 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -37,7 +37,30 @@ jobs: fail-fast: false matrix: include: - # Phoenix (NVHPC) — TEMPORARILY DISABLED (QOS job limit / pre-existing SLURM failures) + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: cpu + interface: none + build_script: "" + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: gpu + interface: acc + build_script: "" + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: gpu + interface: omp + build_script: "" - cluster: frontier name: Oak Ridge | Frontier (CCE) group: phoenix diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b79886a956..5dd072072d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -170,7 +170,22 @@ jobs: strategy: matrix: include: - # Phoenix (GT) — TEMPORARILY DISABLED (pre-existing SLURM/Case Opt failures) + # Phoenix (GT) — build+test combined in SLURM job + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'acc' + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'omp' + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'cpu' + interface: 'none' # Frontier (ORNL) — build on login node, GPU tests sharded for batch partition - runner: 'frontier' cluster: 'frontier' @@ -201,7 +216,24 @@ jobs: cluster_name: 'Oak Ridge | Frontier' device: 'cpu' interface: 'none' - # Frontier AMD — TEMPORARILY 
DISABLED (pre-existing failures unrelated to CCE fix) + # Frontier AMD — build on login node, GPU tests sharded for batch partition + - runner: 'frontier' + cluster: 'frontier_amd' + cluster_name: 'Oak Ridge | Frontier (AMD)' + device: 'gpu' + interface: 'omp' + shard: '1/2' + - runner: 'frontier' + cluster: 'frontier_amd' + cluster_name: 'Oak Ridge | Frontier (AMD)' + device: 'gpu' + interface: 'omp' + shard: '2/2' + - runner: 'frontier' + cluster: 'frontier_amd' + cluster_name: 'Oak Ridge | Frontier (AMD)' + device: 'cpu' + interface: 'none' runs-on: group: phoenix labels: ${{ matrix.runner }} @@ -257,8 +289,16 @@ jobs: strategy: matrix: include: - # Phoenix (GT) — TEMPORARILY DISABLED (pre-existing SLURM/Case Opt failures) - # Frontier AMD — TEMPORARILY DISABLED (pre-existing failures unrelated to CCE fix) + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'acc' + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'omp' - runner: 'frontier' cluster: 'frontier' cluster_name: 'Oak Ridge | Frontier' @@ -269,6 +309,11 @@ jobs: cluster_name: 'Oak Ridge | Frontier' device: 'gpu' interface: 'omp' + - runner: 'frontier' + cluster: 'frontier_amd' + cluster_name: 'Oak Ridge | Frontier (AMD)' + device: 'gpu' + interface: 'omp' runs-on: group: phoenix labels: ${{ matrix.runner }} From 4e6482d281860e425d8062441077c45caf5aae0d Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 7 Mar 2026 18:32:28 -0500 Subject: [PATCH 20/27] Remove Phoenix cpu from bench matrix Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/bench.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b45fc45e40..16b3e0b511 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -37,14 +37,6 @@ jobs: fail-fast: false matrix: include: - - cluster: phoenix - name: Georgia Tech | Phoenix 
(NVHPC) - group: phoenix - labels: gt - flag: p - device: cpu - interface: none - build_script: "" - cluster: phoenix name: Georgia Tech | Phoenix (NVHPC) group: phoenix From 878fddb5a30e0ec7e0323472a3ee65e0f84f8b8d Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 8 Mar 2026 13:06:00 -0400 Subject: [PATCH 21/27] Remove CCE VLA guard from m_chemistry.fpp; slim CI to Frontier (CCE) only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1,500+ stress-test rounds on CCE 19.0.0 showed zero ICEs with plain dimension(num_species) local arrays in m_chemistry.fpp. Remove all #:if USING_CCE fixed-size array guards, the CCE_MAX_SPECIES Fypp constant, the @:PROHIBIT runtime checks, and the matching Python-side species-count validation in input.py. Simplify the compound AMD guard from (not MFC_CASE_OPTIMIZATION and USING_AMD) or USING_CCE to not MFC_CASE_OPTIMIZATION and USING_AMD with a literal dimension(10) — the AMD workaround is preserved. The -Oipa0 per-file CMake flags for m_bubbles_EL and m_phase_change are kept; those ICEs are confirmed required by 20/20 positive-control rounds and GitHub CI history. Temporarily remove non-CCE jobs from CI (GitHub runners, Phoenix, Frontier-AMD) to focus test bandwidth on the CCE fix branch. 
Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/bench.yml | 25 +------ .github/workflows/test.yml | 145 +----------------------------------- .gitignore | 6 +- src/common/m_chemistry.fpp | 48 +++--------- toolchain/mfc/run/input.py | 14 ---- 5 files changed, 18 insertions(+), 220 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 16b3e0b511..103d464087 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -37,22 +37,7 @@ jobs: fail-fast: false matrix: include: - - cluster: phoenix - name: Georgia Tech | Phoenix (NVHPC) - group: phoenix - labels: gt - flag: p - device: gpu - interface: acc - build_script: "" - - cluster: phoenix - name: Georgia Tech | Phoenix (NVHPC) - group: phoenix - labels: gt - flag: p - device: gpu - interface: omp - build_script: "" + # Frontier (ORNL) — CCE only - cluster: frontier name: Oak Ridge | Frontier (CCE) group: phoenix @@ -69,14 +54,6 @@ jobs: device: gpu interface: omp build_script: "bash .github/workflows/frontier/build.sh gpu omp bench" - - cluster: frontier_amd - name: Oak Ridge | Frontier (AMD) - group: phoenix - labels: frontier - flag: famd - device: gpu - interface: omp - build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench" runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5dd072072d..025a019623 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -68,99 +68,6 @@ jobs: with: filters: ".github/file-filter.yml" - github: - name: Github - if: needs.file-changes.outputs.checkall == 'true' - needs: [lint-gate, file-changes] - strategy: - matrix: - os: ['ubuntu', 'macos'] - mpi: ['mpi'] - precision: [''] - debug: ['debug', 'no-debug'] - intel: [true, false] - exclude: - - os: macos - intel: true - - include: - - os: ubuntu - mpi: no-mpi - precision: single - debug: no-debug - intel: false - - fail-fast: false - continue-on-error: 
true - runs-on: ${{ matrix.os }}-latest - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: Setup MacOS - if: matrix.os == 'macos' - run: | - brew update - brew upgrade - brew install coreutils python fftw hdf5 gcc@15 boost open-mpi lapack - echo "FC=gfortran-15" >> $GITHUB_ENV - echo "BOOST_INCLUDE=/opt/homebrew/include/" >> $GITHUB_ENV - - - name: Setup Ubuntu - if: matrix.os == 'ubuntu' && matrix.intel == false - run: | - sudo apt update -y - sudo apt install -y cmake gcc g++ python3 python3-dev hdf5-tools \ - libfftw3-dev libhdf5-dev openmpi-bin libopenmpi-dev \ - libblas-dev liblapack-dev - - - name: Setup Ubuntu (Intel) - if: matrix.os == 'ubuntu' && matrix.intel == true - run: | - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - sudo apt-get update - sudo apt-get install -y intel-oneapi-compiler-fortran intel-oneapi-mpi intel-oneapi-mpi-devel - # Export only new/changed env vars from setvars.sh. - # `printenv >> $GITHUB_ENV` dumps all vars including shell internals - # with special characters that corrupt GITHUB_ENV parsing. 
- printenv | sort > /tmp/env_before - source /opt/intel/oneapi/setvars.sh - printenv | sort > /tmp/env_after - diff /tmp/env_before /tmp/env_after | grep '^>' | sed 's/^> //' >> $GITHUB_ENV - - - name: Get system info for cache key - id: sys-info - run: | - { - uname -m - cat /proc/cpuinfo 2>/dev/null | grep 'model name' | head -1 || sysctl -n machdep.cpu.brand_string 2>/dev/null || true - if command -v ifx &>/dev/null; then ifx --version 2>/dev/null | head -1; else ${FC:-gfortran} --version 2>/dev/null | head -1 || true; fi - ${CC:-gcc} --version 2>/dev/null | head -1 || true - } | (sha256sum 2>/dev/null || shasum -a 256) | cut -c1-16 > /tmp/sys-hash - echo "sys-hash=$(cat /tmp/sys-hash)" >> "$GITHUB_OUTPUT" - - - name: Restore Build Cache - uses: actions/cache@v4 - with: - path: build - key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ steps.sys-info.outputs.sys-hash }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }} - - - name: Build - run: | - /bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} $PRECISION $TEST_ALL - env: - TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} - PRECISION: ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }} - - - name: Test - run: bash .github/scripts/run-tests-with-retry.sh -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT - env: - TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} - TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }} - self: name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})" if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && github.event.pull_request.draft != true @@ -170,23 +77,7 @@ jobs: strategy: matrix: 
include: - # Phoenix (GT) — build+test combined in SLURM job - - runner: 'gt' - cluster: 'phoenix' - cluster_name: 'Georgia Tech | Phoenix' - device: 'gpu' - interface: 'acc' - - runner: 'gt' - cluster: 'phoenix' - cluster_name: 'Georgia Tech | Phoenix' - device: 'gpu' - interface: 'omp' - - runner: 'gt' - cluster: 'phoenix' - cluster_name: 'Georgia Tech | Phoenix' - device: 'cpu' - interface: 'none' - # Frontier (ORNL) — build on login node, GPU tests sharded for batch partition + # Frontier (ORNL) — CCE only - runner: 'frontier' cluster: 'frontier' cluster_name: 'Oak Ridge | Frontier' @@ -216,24 +107,6 @@ jobs: cluster_name: 'Oak Ridge | Frontier' device: 'cpu' interface: 'none' - # Frontier AMD — build on login node, GPU tests sharded for batch partition - - runner: 'frontier' - cluster: 'frontier_amd' - cluster_name: 'Oak Ridge | Frontier (AMD)' - device: 'gpu' - interface: 'omp' - shard: '1/2' - - runner: 'frontier' - cluster: 'frontier_amd' - cluster_name: 'Oak Ridge | Frontier (AMD)' - device: 'gpu' - interface: 'omp' - shard: '2/2' - - runner: 'frontier' - cluster: 'frontier_amd' - cluster_name: 'Oak Ridge | Frontier (AMD)' - device: 'cpu' - interface: 'none' runs-on: group: phoenix labels: ${{ matrix.runner }} @@ -289,16 +162,7 @@ jobs: strategy: matrix: include: - - runner: 'gt' - cluster: 'phoenix' - cluster_name: 'Georgia Tech | Phoenix' - device: 'gpu' - interface: 'acc' - - runner: 'gt' - cluster: 'phoenix' - cluster_name: 'Georgia Tech | Phoenix' - device: 'gpu' - interface: 'omp' + # Frontier (ORNL) — CCE only - runner: 'frontier' cluster: 'frontier' cluster_name: 'Oak Ridge | Frontier' @@ -309,11 +173,6 @@ jobs: cluster_name: 'Oak Ridge | Frontier' device: 'gpu' interface: 'omp' - - runner: 'frontier' - cluster: 'frontier_amd' - cluster_name: 'Oak Ridge | Frontier (AMD)' - device: 'gpu' - interface: 'omp' runs-on: group: phoenix labels: ${{ matrix.runner }} diff --git a/.gitignore b/.gitignore index e80d14a6f9..f831e1fcfe 100644 --- a/.gitignore 
+++ b/.gitignore @@ -105,4 +105,8 @@ benchmarks/*.png *.avi **isolation_rules/ -**.supercode/ \ No newline at end of file +**.supercode/ +# CCE stress-test log directories (local testing artifacts) +cce_*/ +cce_*.log +run_cce_*.sh diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp index 15228fddc7..d7ffbc3cfe 100644 --- a/src/common/m_chemistry.fpp +++ b/src/common/m_chemistry.fpp @@ -6,10 +6,6 @@ #:include 'macros.fpp' #:include 'case.fpp' -#! CCE 19.0.0 workaround: fixed-size array limit for local species arrays under _CRAYFTN. -#! Must match the Python-side check in toolchain/mfc/run/input.py. See PR #1286. -#:set CCE_MAX_SPECIES = 10 - !> @brief Multi-species chemistry interface for thermodynamic properties, reaction rates, and transport coefficients module m_chemistry @@ -67,15 +63,7 @@ contains integer :: x, y, z, eqn real(wp) :: energy, T_in - #:if USING_CCE - real(wp), dimension(${CCE_MAX_SPECIES}$) :: Ys - #:else - real(wp), dimension(num_species) :: Ys - #:endif - - #:if USING_CCE - @:PROHIBIT(num_species > ${CCE_MAX_SPECIES}$, "CCE 19.0.0 workaround: num_species must be <= ${CCE_MAX_SPECIES}$ (fixed-size arrays in m_chemistry.fpp)") - #:endif + real(wp), dimension(num_species) :: Ys do z = bounds(3)%beg, bounds(3)%end do y = bounds(2)%beg, bounds(2)%end @@ -113,17 +101,9 @@ contains type(int_bounds_info), dimension(1:3), intent(in) :: bounds integer :: x, y, z, i - #:if USING_CCE - real(wp), dimension(${CCE_MAX_SPECIES}$) :: Ys - #:else - real(wp), dimension(num_species) :: Ys - #:endif + real(wp), dimension(num_species) :: Ys real(wp) :: mix_mol_weight - #:if USING_CCE - @:PROHIBIT(num_species > ${CCE_MAX_SPECIES}$, "CCE 19.0.0 workaround: num_species must be <= ${CCE_MAX_SPECIES}$ (fixed-size arrays in m_chemistry.fpp)") - #:endif - do z = bounds(3)%beg, bounds(3)%end do y = bounds(2)%beg, bounds(2)%end do x = bounds(1)%beg, bounds(1)%end @@ -151,18 +131,14 @@ contains integer :: eqn real(wp) :: T real(wp) :: rho, omega_m - #:if (not 
MFC_CASE_OPTIMIZATION and USING_AMD) or USING_CCE - real(wp), dimension(${CCE_MAX_SPECIES}$) :: Ys - real(wp), dimension(${CCE_MAX_SPECIES}$) :: omega + #:if not MFC_CASE_OPTIMIZATION and USING_AMD + real(wp), dimension(10) :: Ys + real(wp), dimension(10) :: omega #:else real(wp), dimension(num_species) :: Ys real(wp), dimension(num_species) :: omega #:endif - #:if USING_CCE - @:PROHIBIT(num_species > ${CCE_MAX_SPECIES}$, "CCE 19.0.0 workaround: num_species must be <= ${CCE_MAX_SPECIES}$ (fixed-size arrays in m_chemistry.fpp)") - #:endif - $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega, eqn, T, rho, omega_m]', copyin='[bounds]') do z = bounds(3)%beg, bounds(3)%end do y = bounds(2)%beg, bounds(2)%end @@ -204,11 +180,11 @@ contains type(int_bounds_info), intent(in) :: irx, iry, irz integer, intent(in) :: idir - #:if (not MFC_CASE_OPTIMIZATION and USING_AMD) or USING_CCE - real(wp), dimension(${CCE_MAX_SPECIES}$) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell - real(wp), dimension(${CCE_MAX_SPECIES}$) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2 - real(wp), dimension(${CCE_MAX_SPECIES}$) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k - real(wp), dimension(${CCE_MAX_SPECIES}$) :: Mass_Diffu_Flux, dYk_dxi + #:if not MFC_CASE_OPTIMIZATION and USING_AMD + real(wp), dimension(10) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell + real(wp), dimension(10) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2 + real(wp), dimension(10) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k + real(wp), dimension(10) :: Mass_Diffu_Flux, dYk_dxi #:else real(wp), dimension(num_species) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell real(wp), dimension(num_species) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2 @@ -226,10 +202,6 @@ contains integer :: x, y, z, i, n, eqn integer, dimension(3) :: offsets - #:if USING_CCE - @:PROHIBIT(num_species > ${CCE_MAX_SPECIES}$, "CCE 19.0.0 workaround: num_species must be <= ${CCE_MAX_SPECIES}$ (fixed-size 
arrays in m_chemistry.fpp)") - #:endif - isc1 = irx; isc2 = iry; isc3 = irz $:GPU_UPDATE(device='[isc1,isc2,isc3]') diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py index bdd476365c..cfdc24647c 100644 --- a/toolchain/mfc/run/input.py +++ b/toolchain/mfc/run/input.py @@ -92,20 +92,6 @@ def generate_fpp(self, target) -> None: # Write the generated Fortran code to the m_thermochem.f90 file with the chosen precision sol = self.get_cantera_solution() - # CCE 19.0.0 workaround: m_chemistry.fpp uses dimension(CCE_MAX_SPECIES) for local - # species arrays on Cray builds to avoid an InstCombine ICE. Must match the Fypp - # constant CCE_MAX_SPECIES in src/common/m_chemistry.fpp. - CCE_MAX_SPECIES = 10 - if sol.n_species > CCE_MAX_SPECIES: - msg = (f"Cantera mechanism has {sol.n_species} species > {CCE_MAX_SPECIES}. " - f"Cray Fortran (CCE) builds use a hardcoded dimension({CCE_MAX_SPECIES}) " - "workaround in m_chemistry.fpp and will abort at runtime on CCE. See PR #1286.") - if directive_str is not None: - # GPU builds: hard error — the Fortran PROHIBIT will abort anyway, - # so fail early at input generation rather than at the first chemistry call. 
- raise common.MFCException(msg) - cons.print(f"[bold yellow]Warning:[/bold yellow] {msg}") - thermochem_code = pyro.FortranCodeGenerator().generate( "m_thermochem", sol, From 86f864d7a1421d49417b16b7712d4ef3baa758fb Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 8 Mar 2026 15:54:00 -0400 Subject: [PATCH 22/27] Restore GitHub, Phoenix, and Frontier AMD CI/bench jobs Re-add the jobs that were temporarily removed for CCE testing: - test.yml: github job (gfortran + Intel ifx), Phoenix matrix entries, Frontier AMD matrix entries - bench.yml: Phoenix NVHPC entries, Frontier AMD entry Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/bench.yml | 33 ++++++++++- .github/workflows/test.yml | 111 +++++++++++++++++++++++++++++++++++- 2 files changed, 142 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 103d464087..b45fc45e40 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -37,7 +37,30 @@ jobs: fail-fast: false matrix: include: - # Frontier (ORNL) — CCE only + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: cpu + interface: none + build_script: "" + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: gpu + interface: acc + build_script: "" + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: gpu + interface: omp + build_script: "" - cluster: frontier name: Oak Ridge | Frontier (CCE) group: phoenix @@ -54,6 +77,14 @@ jobs: device: gpu interface: omp build_script: "bash .github/workflows/frontier/build.sh gpu omp bench" + - cluster: frontier_amd + name: Oak Ridge | Frontier (AMD) + group: phoenix + labels: frontier + flag: famd + device: gpu + interface: omp + build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench" runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} diff 
--git a/.github/workflows/test.yml b/.github/workflows/test.yml index 025a019623..dc6a5664b4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -68,6 +68,88 @@ jobs: with: filters: ".github/file-filter.yml" + github: + name: Github + if: needs.file-changes.outputs.checkall == 'true' + needs: [lint-gate, file-changes] + strategy: + matrix: + os: ['ubuntu', 'macos'] + mpi: ['mpi'] + precision: [''] + debug: ['debug', 'no-debug'] + intel: [true, false] + exclude: + - os: macos + intel: true + + include: + - os: ubuntu + mpi: no-mpi + precision: single + debug: no-debug + intel: false + + fail-fast: false + continue-on-error: true + runs-on: ${{ matrix.os }}-latest + + steps: + - name: Clone + uses: actions/checkout@v4 + + - name: Restore Build Cache + uses: actions/cache@v4 + with: + path: build + key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }} + + - name: Setup MacOS + if: matrix.os == 'macos' + run: | + brew update + brew upgrade + brew install coreutils python fftw hdf5 gcc@15 boost open-mpi lapack + echo "FC=gfortran-15" >> $GITHUB_ENV + echo "BOOST_INCLUDE=/opt/homebrew/include/" >> $GITHUB_ENV + + - name: Setup Ubuntu + if: matrix.os == 'ubuntu' && matrix.intel == false + run: | + sudo apt update -y + sudo apt install -y cmake gcc g++ python3 python3-dev hdf5-tools \ + libfftw3-dev libhdf5-dev openmpi-bin libopenmpi-dev \ + libblas-dev liblapack-dev + + - name: Setup Ubuntu (Intel) + if: matrix.os == 'ubuntu' && matrix.intel == true + run: | + wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" + sudo apt-get update + sudo apt-get install -y intel-oneapi-compiler-fortran intel-oneapi-mpi 
intel-oneapi-mpi-devel + source /opt/intel/oneapi/setvars.sh + printenv >> $GITHUB_ENV + + - name: Set up Python 3.14 + uses: actions/setup-python@v5 + with: + python-version: '3.14' + + - name: Build + run: | + /bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL + env: + TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} + + - name: Test + run: | + /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT + env: + TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} + TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }} + self: name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})" if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && github.event.pull_request.draft != true @@ -77,7 +159,23 @@ jobs: strategy: matrix: include: - # Frontier (ORNL) — CCE only + # Phoenix (GT) — build+test combined in SLURM job + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'acc' + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'omp' + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'cpu' + interface: 'none' + # Frontier (ORNL) — CCE - runner: 'frontier' cluster: 'frontier' cluster_name: 'Oak Ridge | Frontier' @@ -107,6 +205,17 @@ jobs: cluster_name: 'Oak Ridge | Frontier' device: 'cpu' interface: 'none' + # Frontier AMD — build on login node, test via SLURM + - runner: 'frontier' + cluster: 'frontier_amd' + cluster_name: 'Oak Ridge | Frontier (AMD)' + device: 'gpu' + interface: 'omp' + - runner: 'frontier' + cluster: 'frontier_amd' + cluster_name: 'Oak Ridge | Frontier (AMD)' + device: 'cpu' + interface: 'none' runs-on: group: phoenix labels: 
${{ matrix.runner }} From c17653f1bf9068a6a67c9734df11dcd1f2a53abf Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 8 Mar 2026 16:07:01 -0400 Subject: [PATCH 23/27] toolchain: log warning when pyrometheus CCE workaround patch applies When the Cray+ACC GPU_ROUTINE macro patch successfully replaces the broken _CRAYFTN/#ifdef block in pyrometheus-generated m_thermochem.f90, emit a yellow warning so engineers can tell the workaround is active and will notice when pyrometheus eventually fixes the upstream issue. Co-Authored-By: Claude Sonnet 4.6 --- toolchain/mfc/run/input.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py index cfdc24647c..c594ad17da 100644 --- a/toolchain/mfc/run/input.py +++ b/toolchain/mfc/run/input.py @@ -119,6 +119,10 @@ def generate_fpp(self, target) -> None: "Cray+ACC GPU_ROUTINE macro patch did not apply. " "Update the pattern in toolchain/mfc/run/input.py." ) + else: + cons.print("[yellow]Warning: Applied CCE 19.0.0 workaround patch to pyrometheus-generated " + "m_thermochem.f90 (replaced _CRAYFTN GPU_ROUTINE macro with !$acc routine seq). " + "Remove this patch once pyrometheus emits correct Cray+ACC directives upstream.[/yellow]") thermochem_code = patched common.file_write( From 24ea0cbc2e24f217f25a0e7d9ec16d7f09457dd0 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 8 Mar 2026 16:17:51 -0400 Subject: [PATCH 24/27] ci: restore diff-based Intel oneAPI env export to avoid GITHUB_ENV corruption Plain `printenv >> $GITHUB_ENV` after sourcing setvars.sh can corrupt GITHUB_ENV parsing due to shell internals with special characters. Restore the diff-based filter from 2c3590cd that was accidentally clobbered by 878fddb5 when slimming CI for CCE testing. 
Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/test.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index dc6a5664b4..01c19da375 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -129,8 +129,13 @@ jobs: sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" sudo apt-get update sudo apt-get install -y intel-oneapi-compiler-fortran intel-oneapi-mpi intel-oneapi-mpi-devel + # Export only new/changed env vars from setvars.sh. + # `printenv >> $GITHUB_ENV` dumps all vars including shell internals + # with special characters that corrupt GITHUB_ENV parsing. + printenv | sort > /tmp/env_before source /opt/intel/oneapi/setvars.sh - printenv >> $GITHUB_ENV + printenv | sort > /tmp/env_after + diff /tmp/env_before /tmp/env_after | grep '^>' | sed 's/^> //' >> $GITHUB_ENV - name: Set up Python 3.14 uses: actions/setup-python@v5 From f8818887ca8430b02e834d4bedde326293e03c0f Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 8 Mar 2026 16:39:08 -0400 Subject: [PATCH 25/27] Restore full case-optimization matrix and Frontier AMD sharding - case-optimization: restore Phoenix (acc+omp) and Frontier AMD (omp) entries that were incorrectly narrowed to CCE-only - self: restore Frontier AMD GPU OMP sharding (1/2, 2/2) that was unintentionally removed Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/test.yml | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 01c19da375..f62e86de10 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -210,12 +210,19 @@ jobs: cluster_name: 'Oak Ridge | Frontier' device: 'cpu' interface: 'none' - # Frontier AMD — build on login node, test via SLURM + # Frontier AMD — build on login node, GPU tests sharded for batch partition - runner: 'frontier' cluster: 'frontier_amd' 
cluster_name: 'Oak Ridge | Frontier (AMD)' device: 'gpu' interface: 'omp' + shard: '1/2' + - runner: 'frontier' + cluster: 'frontier_amd' + cluster_name: 'Oak Ridge | Frontier (AMD)' + device: 'gpu' + interface: 'omp' + shard: '2/2' - runner: 'frontier' cluster: 'frontier_amd' cluster_name: 'Oak Ridge | Frontier (AMD)' @@ -276,7 +283,16 @@ jobs: strategy: matrix: include: - # Frontier (ORNL) — CCE only + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'acc' + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'omp' - runner: 'frontier' cluster: 'frontier' cluster_name: 'Oak Ridge | Frontier' @@ -287,6 +303,11 @@ jobs: cluster_name: 'Oak Ridge | Frontier' device: 'gpu' interface: 'omp' + - runner: 'frontier' + cluster: 'frontier_amd' + cluster_name: 'Oak Ridge | Frontier (AMD)' + device: 'gpu' + interface: 'omp' runs-on: group: phoenix labels: ${{ matrix.runner }} From fd06e97b5c1b076b6ceb79b919ab73b9e2af999e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 8 Mar 2026 16:50:13 -0400 Subject: [PATCH 26/27] Adopt CI improvements from PR #1295 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - self/case-optimization: clean: true on checkout - self: build retry max_attempts 3→2, on_retry rm -rf build - self/case-optimization: add Cancel SLURM Jobs step on cancellation - case-optimization: drop retry wrapper on Pre-Build (login node) - CMakeLists.txt: skip -march=native for gcov builds; add -mno-avx512fp16 to avoid binutils <2.38 assembler failures on Granite Rapids CPUs Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/test.yml | 34 +++++++++++++++++++++++----------- CMakeLists.txt | 25 ++++++++++++++++++------- 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f62e86de10..49934c6973 100644 --- a/.github/workflows/test.yml +++ 
b/.github/workflows/test.yml @@ -237,21 +237,30 @@ jobs: - name: Clone uses: actions/checkout@v4 with: - clean: false + clean: true - name: Build if: matrix.cluster != 'phoenix' uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3 with: - max_attempts: 3 + max_attempts: 2 retry_wait_seconds: 60 timeout_minutes: 60 command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} - on_retry_command: ./mfc.sh clean + on_retry_command: rm -rf build - name: Test run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }} + - name: Cancel SLURM Jobs + if: cancelled() + run: | + find . -name "*.slurm_job_id" | while read -r f; do + job_id=$(cat "$f") + echo "Cancelling SLURM job $job_id" + scancel "$job_id" 2>/dev/null || true + done + - name: Compute Log Slug if: always() id: log @@ -315,7 +324,7 @@ jobs: - name: Clone uses: actions/checkout@v4 with: - clean: false + clean: true - name: Pre-Build (SLURM) if: matrix.cluster == 'phoenix' @@ -323,17 +332,20 @@ jobs: - name: Pre-Build (login node) if: matrix.cluster != 'phoenix' - uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3 - with: - max_attempts: 3 - retry_wait_seconds: 60 - timeout_minutes: 120 - command: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }} - on_retry_command: ./mfc.sh clean + run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }} - name: Run Case-Optimization Tests run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} + - name: Cancel SLURM Jobs + if: cancelled() + run: | + find . 
-name "*.slurm_job_id" | while read -r f; do + job_id=$(cat "$f") + echo "Cancelling SLURM job $job_id" + scancel "$job_id" 2>/dev/null || true + done + - name: Print Logs if: always() run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index a3aa9a1444..15c55af8df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -224,13 +224,24 @@ endif() if (CMAKE_BUILD_TYPE STREQUAL "Release") # Processor tuning: Check if we can target the host's native CPU's ISA. - CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE) - if (SUPPORTS_MARCH_NATIVE) - add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>) - else() - CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE) - if (SUPPORTS_MCPU_NATIVE) - add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>) + # Skip for gcov builds — -march=native on newer CPUs (e.g. Granite Rapids) + # can emit instructions the system assembler doesn't support. + if (NOT MFC_GCov) + CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE) + if (SUPPORTS_MARCH_NATIVE) + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>) + # Disable AVX-512 FP16: gfortran >=12 emits vmovw instructions on + # Granite Rapids CPUs, but binutils <2.38 cannot assemble them. + # FP16 is unused in MFC's double-precision computations. + CHECK_FORTRAN_COMPILER_FLAG("-mno-avx512fp16" SUPPORTS_MNO_AVX512FP16) + if (SUPPORTS_MNO_AVX512FP16) + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mno-avx512fp16>) + endif() + else() + CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE) + if (SUPPORTS_MCPU_NATIVE) + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>) + endif() endif() endif() From 917cdd55c8f437f7e494977a48fbca32d04ac37f Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 8 Mar 2026 17:15:15 -0400 Subject: [PATCH 27/27] Re-trigger CI