From 0d995d529cdf19e88d437ef208f2c37ff11ef458 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 13 Jun 2026 15:46:46 -0400
Subject: [PATCH 1/3] Fix silent viscosity loss on AMD flang GPU (host-capture
 Re_size; complete Riemann private lists)

AMD flang reads the static declare-target Re_size stale across translation units, silently disabling viscosity. Host-capture it into Re_size_loc and firstprivate it into the Riemann kernels for all compilers. That firstprivate clause exposed a latent bug: several per-cell scalars (s_M/s_P/xi_M/xi_P in HLL, c_sum_Yi_Phi/flux_ene_e in HLLC, Gamm_L/Gamm_R/flux_tau_L/flux_tau_R in LF) were omitted from private() and relied on CCE-19's defaultmap(firstprivate:scalar) auto-privatization, which the firstprivate clause disrupts (gross wrong physics on Cray). Complete the private lists -- a no-op on every compiler -- so firstprivate is safe everywhere; no compiler gate needed. Verified on Frontier: AMD-flang 29/29 and Cray-CCE (rs=1/2/5) of the previously-failing cases pass.
---
 src/simulation/m_riemann_solver_hll.fpp  | 19 ++++++-----
 src/simulation/m_riemann_solver_hllc.fpp | 42 +++++++++++++-----------
 src/simulation/m_riemann_solver_lf.fpp   | 29 ++++++++--------
 3 files changed, 49 insertions(+), 41 deletions(-)

diff --git a/src/simulation/m_riemann_solver_hll.fpp b/src/simulation/m_riemann_solver_hll.fpp
index e235a1c0e4..64dea81ff6 100644
--- a/src/simulation/m_riemann_solver_hll.fpp
+++ b/src/simulation/m_riemann_solver_hll.fpp
@@ -89,14 +89,15 @@ contains
         real(wp)                  :: vel_L_tmp, vel_R_tmp
         real(wp)                  :: Ms_L, Ms_R, pres_SL, pres_SR
         real(wp)                  :: alpha_L_sum, alpha_R_sum
-        real(wp)                  :: zcoef, pcorr   !< low Mach number correction
+        real(wp)                  :: zcoef, pcorr  !< low Mach number correction
         type(riemann_states)      :: c_fast, pres_mag
         type(riemann_states_vec3) :: B
-        type(riemann_states)      :: Ga             !< Gamma (Lorentz factor)
+        type(riemann_states)      :: Ga  !< Gamma (Lorentz factor)
         type(riemann_states)      :: vdotB, B2
-        type(riemann_states_vec3) :: b4             !< 4-magnetic field components (spatial: b4x, b4y, b4z)
-        type(riemann_states_vec3) :: cm             !< Conservative momentum variables
+        type(riemann_states_vec3) :: b4  !< 4-magnetic field components (spatial: b4x, b4y, b4z)
+        type(riemann_states_vec3) :: cm  !< Conservative momentum variables
         integer                   :: i, j, k, l, q  !< Generic loop iterators
+        integer, dimension(2)     :: Re_size_loc  !< host copy of Re_size; amdflang reads the declare-target original stale cross-TU
         ! Populating the buffers of the left and right Riemann problem states variables, based on the choice of boundary conditions
 
         call s_populate_riemann_states_variables_buffers(qL_prim_rsx_vf, dqL_prim_dx_vf, dqL_prim_dy_vf, dqL_prim_dz_vf, &
@@ -104,6 +105,7 @@ contains
 
         ! Reshaping inputted data based on dimensional splitting direction
         call s_initialize_riemann_solver(flux_src_vf, norm_dir)
+        Re_size_loc = Re_size
         #:for NORM_DIR, XYZ, STENCIL_VAR, COORDS, X_BND, Y_BND, Z_BND in &
                     [(1, 'x', 'j', '{STENCIL_IDX}, k, l', 'is1', 'is2', 'is3'), &
                      (2, 'y', 'k', 'j, {STENCIL_IDX}, l', 'is2', 'is1', 'is3'), &
@@ -119,7 +121,8 @@ contains
                                     & Y_L, Y_R, MW_L, MW_R, R_gas_L, R_gas_R, Cp_L, Cp_R, Cv_L, Cv_R, Gamm_L, Gamm_R, gamma_L, &
                                     & gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, qv_avg, c_L, c_R, G_L, G_R, rho_avg, H_avg, c_avg, &
                                     & gamma_avg, ptilde_L, ptilde_R, vel_L_rms, vel_R_rms, vel_avg_rms, Ms_L, Ms_R, pres_SL, &
-                                    & pres_SR, alpha_L_sum, alpha_R_sum, flux_tau_L, flux_tau_R]', copyin='[norm_dir]')
+                                    & pres_SR, alpha_L_sum, alpha_R_sum, flux_tau_L, flux_tau_R, s_M, s_P, xi_M, xi_P]', &
+                                    & copyin='[norm_dir]', firstprivate='[Re_size_loc]')
                 do l = ${Z_BND}$%beg, ${Z_BND}$%end
                     do k = ${Y_BND}$%beg, ${Y_BND}$%end
                         do j = ${X_BND}$%beg, ${X_BND}$%end
@@ -216,11 +219,11 @@ contains
                                     Re_L(i) = dflt_real
                                     Re_R(i) = dflt_real
 
-                                    if (Re_size(i) > 0) Re_L(i) = 0._wp
-                                    if (Re_size(i) > 0) Re_R(i) = 0._wp
+                                    if (Re_size_loc(i) > 0) Re_L(i) = 0._wp
+                                    if (Re_size_loc(i) > 0) Re_R(i) = 0._wp
 
                                     $:GPU_LOOP(parallelism='[seq]')
-                                    do q = 1, Re_size(i)
+                                    do q = 1, Re_size_loc(i)
                                         Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) + Re_L(i)
                                         Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) + Re_R(i)
                                     end do
diff --git a/src/simulation/m_riemann_solver_hllc.fpp b/src/simulation/m_riemann_solver_hllc.fpp
index f8ed19379e..f872833c9c 100644
--- a/src/simulation/m_riemann_solver_hllc.fpp
+++ b/src/simulation/m_riemann_solver_hllc.fpp
@@ -111,14 +111,15 @@ contains
         #:else
             real(wp), dimension(num_dims) :: xi_field_L, xi_field_R
         #:endif
-        real(wp) :: G_L, G_R
-        real(wp) :: vel_L_rms, vel_R_rms, vel_avg_rms
-        real(wp) :: vel_L_tmp, vel_R_tmp
-        real(wp) :: rho_Star, E_Star, p_Star, p_K_Star, vel_K_star
-        real(wp) :: pres_SL, pres_SR, Ms_L, Ms_R
-        real(wp) :: flux_ene_e
-        real(wp) :: zcoef, pcorr           !< low Mach number correction
-        integer  :: Re_max, i, j, k, l, q  !< Generic loop iterators
+        real(wp)              :: G_L, G_R
+        real(wp)              :: vel_L_rms, vel_R_rms, vel_avg_rms
+        real(wp)              :: vel_L_tmp, vel_R_tmp
+        real(wp)              :: rho_Star, E_Star, p_Star, p_K_Star, vel_K_star
+        real(wp)              :: pres_SL, pres_SR, Ms_L, Ms_R
+        real(wp)              :: flux_ene_e
+        real(wp)              :: zcoef, pcorr  !< low Mach number correction
+        integer               :: Re_max, i, j, k, l, q  !< Generic loop iterators
+        integer, dimension(2) :: Re_size_loc  !< host copy of Re_size; amdflang reads the declare-target original stale cross-TU
         ! Populating the buffers of the left and right Riemann problem states variables, based on the choice of boundary conditions
 
         call s_populate_riemann_states_variables_buffers(qL_prim_rsx_vf, dqL_prim_dx_vf, dqL_prim_dy_vf, dqL_prim_dz_vf, &
@@ -127,6 +128,7 @@ contains
         ! Reshaping inputted data based on dimensional splitting direction
 
         call s_initialize_riemann_solver(flux_src_vf, norm_dir)
+        Re_size_loc = Re_size
 
         #:for NORM_DIR, XYZ, STENCIL_VAR, COORDS, X_BND, Y_BND, Z_BND in &
                     [(1, 'x', 'j', '{STENCIL_IDX}, k, l', 'is1', 'is2', 'is3'), &
@@ -147,7 +149,7 @@ contains
                                         & rho_avg, H_avg, c_avg, gamma_avg, ptilde_L, ptilde_R, vel_L_rms, vel_R_rms, &
                                         & vel_avg_rms, vel_L_tmp, vel_R_tmp, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, &
                                         & alpha_R_sum, rho_Star, E_Star, p_Star, p_K_Star, vel_K_star, s_L, s_R, s_M, s_P, s_S, &
-                                        & xi_M, xi_P, xi_L, xi_R, xi_L_m1, xi_R_m1, xi_MP, xi_PP]')
+                                        & xi_M, xi_P, xi_L, xi_R, xi_L_m1, xi_R_m1, xi_MP, xi_PP]', firstprivate='[Re_size_loc]')
                     do l = ${Z_BND}$%beg, ${Z_BND}$%end
                         do k = ${Y_BND}$%beg, ${Y_BND}$%end
                             do j = ${X_BND}$%beg, ${X_BND}$%end
@@ -229,10 +231,10 @@ contains
                                     do i = 1, 2
                                         Re_L(i) = dflt_real
                                         Re_R(i) = dflt_real
-                                        if (Re_size(i) > 0) Re_L(i) = 0._wp
-                                        if (Re_size(i) > 0) Re_R(i) = 0._wp
+                                        if (Re_size_loc(i) > 0) Re_L(i) = 0._wp
+                                        if (Re_size_loc(i) > 0) Re_R(i) = 0._wp
                                         $:GPU_LOOP(parallelism='[seq]')
-                                        do q = 1, Re_size(i)
+                                        do q = 1, Re_size_loc(i)
                                             Re_L(i) = qL_prim_rsx_vf(${SF('')}$, eqn_idx%E + Re_idx(i, q))/Res_gs(i, q) + Re_L(i)
                                             Re_R(i) = qR_prim_rsx_vf(${SF(' + 1')}$, eqn_idx%E + Re_idx(i, q))/Res_gs(i, &
                                                  & q) + Re_R(i)
@@ -782,7 +784,7 @@ contains
                                         & Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, s_L, s_R, s_M, s_P, s_S, xi_M, &
                                         & xi_P, xi_L, xi_R, xi_L_m1, xi_R_m1, xi_MP, xi_PP, nbub_L, nbub_R, PbwR3Lbar, PbwR3Rbar, &
                                         & R3Lbar, R3Rbar, R3V2Lbar, R3V2Rbar, Ys_L, Ys_R, Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, &
-                                        & Gamma_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2]')
+                                        & Gamma_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2]', firstprivate='[Re_size_loc]')
                     do l = ${Z_BND}$%beg, ${Z_BND}$%end
                         do k = ${Y_BND}$%beg, ${Y_BND}$%end
                             do j = ${X_BND}$%beg, ${X_BND}$%end
@@ -851,11 +853,11 @@ contains
                                             Re_L(i) = dflt_real
                                             Re_R(i) = dflt_real
 
-                                            if (Re_size(i) > 0) Re_L(i) = 0._wp
-                                            if (Re_size(i) > 0) Re_R(i) = 0._wp
+                                            if (Re_size_loc(i) > 0) Re_L(i) = 0._wp
+                                            if (Re_size_loc(i) > 0) Re_R(i) = 0._wp
 
                                             $:GPU_LOOP(parallelism='[seq]')
-                                            do q = 1, Re_size(i)
+                                            do q = 1, Re_size_loc(i)
                                                 Re_L(i) = (1._wp - qL_prim_rsx_vf(${SF('')}$, eqn_idx%E + Re_idx(i, &
                                                      & q)))/Res_gs(i, q) + Re_L(i)
                                                 Re_R(i) = (1._wp - qR_prim_rsx_vf(${SF(' + 1')}$, eqn_idx%E + Re_idx(i, &
@@ -1179,7 +1181,7 @@ contains
                                         & vel_R, Re_L, Re_R, alpha_L, alpha_R, s_L, s_R, s_S, vel_avg_rms, pcorr, zcoef, &
                                         & vel_L_tmp, vel_R_tmp, Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, &
                                         & tau_e_L, tau_e_R, xi_field_L, xi_field_R, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, G_L, &
-                                        & G_R]', copyin='[is1, is2, is3]')
+                                        & G_R, c_sum_Yi_Phi, flux_ene_e]', copyin='[is1, is2, is3]', firstprivate='[Re_size_loc]')
                     do l = ${Z_BND}$%beg, ${Z_BND}$%end
                         do k = ${Y_BND}$%beg, ${Y_BND}$%end
                             do j = ${X_BND}$%beg, ${X_BND}$%end
@@ -1244,8 +1246,8 @@ contains
                                 end do
 
                                 Re_max = 0
-                                if (Re_size(1) > 0) Re_max = 1
-                                if (Re_size(2) > 0) Re_max = 2
+                                if (Re_size_loc(1) > 0) Re_max = 1
+                                if (Re_size_loc(2) > 0) Re_max = 2
 
                                 if (viscous) then
                                     $:GPU_LOOP(parallelism='[seq]')
@@ -1254,7 +1256,7 @@ contains
                                         Re_R(i) = 0._wp
 
                                         $:GPU_LOOP(parallelism='[seq]')
-                                        do q = 1, Re_size(i)
+                                        do q = 1, Re_size_loc(i)
                                             Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) + Re_L(i)
                                             Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) + Re_R(i)
                                         end do
diff --git a/src/simulation/m_riemann_solver_lf.fpp b/src/simulation/m_riemann_solver_lf.fpp
index c3c3aaa959..76cb017c59 100644
--- a/src/simulation/m_riemann_solver_lf.fpp
+++ b/src/simulation/m_riemann_solver_lf.fpp
@@ -88,15 +88,16 @@ contains
         real(wp)                  :: vel_L_tmp, vel_R_tmp
         real(wp)                  :: Ms_L, Ms_R, pres_SL, pres_SR
         real(wp)                  :: alpha_L_sum, alpha_R_sum
-        real(wp)                  :: zcoef, pcorr    !< low Mach number correction
+        real(wp)                  :: zcoef, pcorr  !< low Mach number correction
         type(riemann_states)      :: c_fast, pres_mag
         type(riemann_states_vec3) :: B
-        type(riemann_states)      :: Ga              !< Gamma (Lorentz factor)
+        type(riemann_states)      :: Ga  !< Gamma (Lorentz factor)
         type(riemann_states)      :: vdotB, B2
-        type(riemann_states_vec3) :: b4              !< 4-magnetic field components (spatial: b4x, b4y, b4z)
-        type(riemann_states_vec3) :: cm              !< Conservative momentum variables
-        integer                   :: i, j, k, l, q   !< Generic loop iterators
+        type(riemann_states_vec3) :: b4  !< 4-magnetic field components (spatial: b4x, b4y, b4z)
+        type(riemann_states_vec3) :: cm  !< Conservative momentum variables
+        integer                   :: i, j, k, l, q  !< Generic loop iterators
         integer, dimension(3)     :: idx_right_phys  !< Physical (j,k,l) indices for right state.
+        integer, dimension(2)     :: Re_size_loc  !< host copy of Re_size; amdflang reads the declare-target original stale cross-TU
         ! Populating the buffers of the left and right Riemann problem states variables, based on the choice of boundary conditions
 
         call s_populate_riemann_states_variables_buffers(qL_prim_rsx_vf, dqL_prim_dx_vf, dqL_prim_dy_vf, dqL_prim_dz_vf, &
@@ -104,6 +105,7 @@ contains
 
         ! Reshaping inputted data based on dimensional splitting direction
         call s_initialize_riemann_solver(flux_src_vf, norm_dir)
+        Re_size_loc = Re_size
         #:for NORM_DIR, XYZ, STENCIL_VAR, COORDS, X_BND, Y_BND, Z_BND in &
                     [(1, 'x', 'j', '{STENCIL_IDX}, k, l', 'is1', 'is2', 'is3'), &
                      (2, 'y', 'k', 'j, {STENCIL_IDX}, l', 'is2', 'is1', 'is3'), &
@@ -119,7 +121,8 @@ contains
                                     & vel_R_tmp, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, c_avg, pres_L, pres_R, &
                                     & rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, c_L, c_R, E_L, E_R, H_L, &
                                     & H_R, ptilde_L, ptilde_R, s_M, s_P, xi_M, xi_P, Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi, &
-                                    & Cp_L, Cp_R, Cv_L, Cv_R, R_gas_L, R_gas_R, MW_L, MW_R, T_L, T_R, Y_L, Y_R]')
+                                    & Cp_L, Cp_R, Cv_L, Cv_R, R_gas_L, R_gas_R, MW_L, MW_R, T_L, T_R, Y_L, Y_R, Gamm_L, Gamm_R, &
+                                    & flux_tau_L, flux_tau_R]', firstprivate='[Re_size_loc]')
                 do l = ${Z_BND}$%beg, ${Z_BND}$%end
                     do k = ${Y_BND}$%beg, ${Y_BND}$%end
                         do j = ${X_BND}$%beg, ${X_BND}$%end
@@ -216,11 +219,11 @@ contains
                                     Re_L(i) = dflt_real
                                     Re_R(i) = dflt_real
 
-                                    if (Re_size(i) > 0) Re_L(i) = 0._wp
-                                    if (Re_size(i) > 0) Re_R(i) = 0._wp
+                                    if (Re_size_loc(i) > 0) Re_L(i) = 0._wp
+                                    if (Re_size_loc(i) > 0) Re_R(i) = 0._wp
 
                                     $:GPU_LOOP(parallelism='[seq]')
-                                    do q = 1, Re_size(i)
+                                    do q = 1, Re_size_loc(i)
                                         Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) + Re_L(i)
                                         Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) + Re_R(i)
                                     end do
@@ -599,7 +602,7 @@ contains
 
         if (viscous) then
             $:GPU_PARALLEL_LOOP(collapse=3, private='[i, j, k, l, idx_right_phys, vel_grad_L, vel_grad_R, alpha_L, alpha_R, &
-                                & vel_L, vel_R, Re_L, Re_R]', copyin='[norm_dir]')
+                                & vel_L, vel_R, Re_L, Re_R]', copyin='[norm_dir]', firstprivate='[Re_size_loc]')
             do l = isz%beg, isz%end
                 do k = isy%beg, isy%end
                     do j = isx%beg, isx%end
@@ -650,11 +653,11 @@ contains
                             Re_L(i) = dflt_real
                             Re_R(i) = dflt_real
 
-                            if (Re_size(i) > 0) Re_L(i) = 0._wp
-                            if (Re_size(i) > 0) Re_R(i) = 0._wp
+                            if (Re_size_loc(i) > 0) Re_L(i) = 0._wp
+                            if (Re_size_loc(i) > 0) Re_R(i) = 0._wp
 
                             $:GPU_LOOP(parallelism='[seq]')
-                            do q = 1, Re_size(i)
+                            do q = 1, Re_size_loc(i)
                                 Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) + Re_L(i)
                                 Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) + Re_R(i)
                             end do

From a462123281c2476b78444219ac9826038a0296ee Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 13 Jun 2026 23:50:27 -0400
Subject: [PATCH 2/3] macros: fold long GPU directives across continuation
 lines for nvfortran

The host-capture firstprivate(Re_size_loc) added to the LF/HLL/HLLC solvers (the AMD-flang viscosity fix) pushed the longest GPU_PARALLEL_LOOP directive past nvfortran's ~1000-char source-line limit (1011 chars -> 'source line too long' / unbalanced parentheses; AMD and Cray accept the long line, nvfortran caps it). FOLD_DIRECTIVE wraps the assembled directive at whole-clause boundaries with repeated sentinels (!$acc&/!$omp&), so the longest emitted line is prefix+longest-single-clause (817 chars on LF) -- shorter than the single line a build with one fewer clause already compiles. firstprivate is preserved (AMD/Cray correctness) on its own continuation. Validated: nvfortran 25.5 -acc and -mp=gpu compile+run the folded marker-interleaved continuation correctly; the cpp line-markers between continuations are already emitted by master for regular Fortran continuations and compile on Cray/AMD CI. Short directives (<200 chars) are unchanged.
---
 src/common/include/acc_macros.fpp             |  8 +++---
 src/common/include/omp_macros.fpp             |  4 +--
 src/common/include/shared_parallel_macros.fpp | 25 +++++++++++++++++++
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/common/include/acc_macros.fpp b/src/common/include/acc_macros.fpp
index bd4284c01b..b91138f3b9 100644
--- a/src/common/include/acc_macros.fpp
+++ b/src/common/include/acc_macros.fpp
@@ -121,8 +121,8 @@
         & copyout_val.strip('\n') + create_val.strip('\n') + &
         & no_create_val.strip('\n') + present_val.strip('\n') + &
         & deviceptr_val.strip('\n') + attach_val.strip('\n')
-    #:set acc_directive = '!$acc parallel ' + &
-        & acc_clause_val + extraAccArgs_val.strip('\n')
+    #:set acc_directive = FOLD_DIRECTIVE('!$acc parallel ' + &
+        & acc_clause_val + extraAccArgs_val.strip('\n'), '!$acc').strip('\n')
     #:set end_acc_directive = '!$acc end parallel'
     $:acc_directive
     $:code
@@ -153,8 +153,8 @@
         & copyout_val.strip('\n') + create_val.strip('\n') + &
         & no_create_val.strip('\n') + present_val.strip('\n') + &
         & deviceptr_val.strip('\n') + attach_val.strip('\n')
-    #:set acc_directive = '!$acc parallel loop ' + &
-        & clause_val + extraAccArgs_val.strip('\n')
+    #:set acc_directive = FOLD_DIRECTIVE('!$acc parallel loop ' + &
+        & clause_val + extraAccArgs_val.strip('\n'), '!$acc').strip('\n')
     $:acc_directive
 #:enddef
 
diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp
index 7620e7607f..cdf49db773 100644
--- a/src/common/include/omp_macros.fpp
+++ b/src/common/include/omp_macros.fpp
@@ -141,7 +141,7 @@
         & deviceptr_val.strip('\n') + attach_val.strip('\n')
 
     #:set omp_clause_val = omp_clause_val.strip('\n')
-    #:set omp_directive = '!$omp target teams ' + omp_clause_val + extraOmpArgs_val.strip('\n')
+    #:set omp_directive = FOLD_DIRECTIVE('!$omp target teams ' + omp_clause_val + extraOmpArgs_val.strip('\n'), '!$omp').strip('\n')
 
     #:set omp_end_directive = '!$omp end target teams'
     $:omp_directive
@@ -186,7 +186,7 @@
         #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) '
     #:endif
 
-    #:set omp_directive = omp_start_directive + clause_val + extraOmpArgs_val.strip('\n')
+    #:set omp_directive = FOLD_DIRECTIVE(omp_start_directive + clause_val + extraOmpArgs_val.strip('\n'), '!$omp').strip('\n')
     $:omp_directive
 #:enddef
 
diff --git a/src/common/include/shared_parallel_macros.fpp b/src/common/include/shared_parallel_macros.fpp
index 36bee0a23a..59bd15b8c7 100644
--- a/src/common/include/shared_parallel_macros.fpp
+++ b/src/common/include/shared_parallel_macros.fpp
@@ -117,4 +117,29 @@
     #:endif
     $:extraArgs_val
 #:enddef
+
+#:def FOLD_DIRECTIVE(directive, sentinel, width=200)
+    #! Fold a long GPU directive across free-form continuation lines so it stays
+    #! under nvfortran's ~1000-char source-line limit. Breaks only at whole-clause
+    #! boundaries (clause(...) groups and bare keywords), repeating the sentinel
+    #! (e.g. '!$acc&') on each continuation -- which fypp's --no-folding cannot do
+    #! because its generic folder omits the sentinel. Every emitted line is no
+    #! longer than the prefix plus the single longest clause, i.e. no longer than
+    #! the unfolded line a build with one fewer clause already compiles.
+    #! Tokens are clause(args) groups (args may nest one paren level) or bare words.
+    #:set _toks = re.findall(r'\w+\((?:[^()]|\([^()]*\))*\)|\S+', directive)
+    #:set _lines = []
+    #:set _cur = _toks[0] if _toks else ''
+    #! Greedy fill: open a sentinel-prefixed continuation once width would be exceeded.
+    #:for _t in _toks[1:]
+        #:if len(_cur) + 1 + len(_t) > width
+            #:set _lines = _lines + [_cur + ' &']
+            #:set _cur = sentinel + '& ' + _t
+        #:else
+            #:set _cur = _cur + ' ' + _t
+        #:endif
+    #:endfor
+    #:set _lines = _lines + [_cur]
+    $:'\n'.join(_lines)
+#:enddef
 ! New line at end of file is required for FYPP

From ecf7f22e389b508dc826d9120832e3d4d288a64b Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sun, 14 Jun 2026 00:36:05 -0400
Subject: [PATCH 3/3] macros: avoid '...' placeholder in FOLD_DIRECTIVE comment
 (source lint)

---
 src/common/include/shared_parallel_macros.fpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common/include/shared_parallel_macros.fpp b/src/common/include/shared_parallel_macros.fpp
index 59bd15b8c7..75d0098fc8 100644
--- a/src/common/include/shared_parallel_macros.fpp
+++ b/src/common/include/shared_parallel_macros.fpp
@@ -121,7 +121,7 @@
 #:def FOLD_DIRECTIVE(directive, sentinel, width=200)
     #! Fold a long GPU directive across free-form continuation lines so it stays
     #! under nvfortran's ~1000-char source-line limit. Breaks only at whole-clause
-    #! boundaries (clause(...) groups and bare keywords), repeating the sentinel
+    #! boundaries (clause(args) groups and bare keywords), repeating the sentinel
     #! (e.g. '!$acc&') on each continuation -- which fypp's --no-folding cannot do
     #! because its generic folder omits the sentinel. Every emitted line is no
     #! longer than the prefix plus the single longest clause, i.e. no longer than