From 0d995d529cdf19e88d437ef208f2c37ff11ef458 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 13 Jun 2026 15:46:46 -0400 Subject: [PATCH 1/3] Fix silent viscosity loss on AMD flang GPU (host-capture Re_size; complete Riemann private lists) AMD flang reads the static declare-target Re_size stale across translation units, silently disabling viscosity. Host-capture it into Re_size_loc and firstprivate it into the Riemann kernels for all compilers. That firstprivate clause exposed a latent bug: several per-cell scalars (s_M/s_P/xi_M/xi_P in HLL, c_sum_Yi_Phi/flux_ene_e in HLLC, Gamm_L/Gamm_R/flux_tau_L/flux_tau_R in LF) were omitted from private() and relied on CCE-19's defaultmap(firstprivate:scalar) auto-privatization, which the firstprivate clause disrupts (grossly wrong physics on Cray). Complete the private lists -- a no-op on every compiler -- so firstprivate is safe everywhere; no compiler gate needed. Verified on Frontier: the previously-failing cases pass on AMD-flang (29/29) and on Cray-CCE (rs=1/2/5).
--- src/simulation/m_riemann_solver_hll.fpp | 19 ++++++----- src/simulation/m_riemann_solver_hllc.fpp | 42 +++++++++++++----------- src/simulation/m_riemann_solver_lf.fpp | 29 ++++++++-------- 3 files changed, 49 insertions(+), 41 deletions(-) diff --git a/src/simulation/m_riemann_solver_hll.fpp b/src/simulation/m_riemann_solver_hll.fpp index e235a1c0e4..64dea81ff6 100644 --- a/src/simulation/m_riemann_solver_hll.fpp +++ b/src/simulation/m_riemann_solver_hll.fpp @@ -89,14 +89,15 @@ contains real(wp) :: vel_L_tmp, vel_R_tmp real(wp) :: Ms_L, Ms_R, pres_SL, pres_SR real(wp) :: alpha_L_sum, alpha_R_sum - real(wp) :: zcoef, pcorr !< low Mach number correction + real(wp) :: zcoef, pcorr !< low Mach number correction type(riemann_states) :: c_fast, pres_mag type(riemann_states_vec3) :: B - type(riemann_states) :: Ga !< Gamma (Lorentz factor) + type(riemann_states) :: Ga !< Gamma (Lorentz factor) type(riemann_states) :: vdotB, B2 - type(riemann_states_vec3) :: b4 !< 4-magnetic field components (spatial: b4x, b4y, b4z) - type(riemann_states_vec3) :: cm !< Conservative momentum variables + type(riemann_states_vec3) :: b4 !< 4-magnetic field components (spatial: b4x, b4y, b4z) + type(riemann_states_vec3) :: cm !< Conservative momentum variables integer :: i, j, k, l, q !< Generic loop iterators + integer, dimension(2) :: Re_size_loc !< host copy of Re_size; amdflang reads the declare-target original stale cross-TU ! Populating the buffers of the left and right Riemann problem states variables, based on the choice of boundary conditions call s_populate_riemann_states_variables_buffers(qL_prim_rsx_vf, dqL_prim_dx_vf, dqL_prim_dy_vf, dqL_prim_dz_vf, & @@ -104,6 +105,7 @@ contains ! 
Reshaping inputted data based on dimensional splitting direction call s_initialize_riemann_solver(flux_src_vf, norm_dir) + Re_size_loc = Re_size #:for NORM_DIR, XYZ, STENCIL_VAR, COORDS, X_BND, Y_BND, Z_BND in & [(1, 'x', 'j', '{STENCIL_IDX}, k, l', 'is1', 'is2', 'is3'), & (2, 'y', 'k', 'j, {STENCIL_IDX}, l', 'is2', 'is1', 'is3'), & @@ -119,7 +121,8 @@ contains & Y_L, Y_R, MW_L, MW_R, R_gas_L, R_gas_R, Cp_L, Cp_R, Cv_L, Cv_R, Gamm_L, Gamm_R, gamma_L, & & gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, qv_avg, c_L, c_R, G_L, G_R, rho_avg, H_avg, c_avg, & & gamma_avg, ptilde_L, ptilde_R, vel_L_rms, vel_R_rms, vel_avg_rms, Ms_L, Ms_R, pres_SL, & - & pres_SR, alpha_L_sum, alpha_R_sum, flux_tau_L, flux_tau_R]', copyin='[norm_dir]') + & pres_SR, alpha_L_sum, alpha_R_sum, flux_tau_L, flux_tau_R, s_M, s_P, xi_M, xi_P]', & + & copyin='[norm_dir]', firstprivate='[Re_size_loc]') do l = ${Z_BND}$%beg, ${Z_BND}$%end do k = ${Y_BND}$%beg, ${Y_BND}$%end do j = ${X_BND}$%beg, ${X_BND}$%end @@ -216,11 +219,11 @@ contains Re_L(i) = dflt_real Re_R(i) = dflt_real - if (Re_size(i) > 0) Re_L(i) = 0._wp - if (Re_size(i) > 0) Re_R(i) = 0._wp + if (Re_size_loc(i) > 0) Re_L(i) = 0._wp + if (Re_size_loc(i) > 0) Re_R(i) = 0._wp $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) + do q = 1, Re_size_loc(i) Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) + Re_L(i) Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) + Re_R(i) end do diff --git a/src/simulation/m_riemann_solver_hllc.fpp b/src/simulation/m_riemann_solver_hllc.fpp index f8ed19379e..f872833c9c 100644 --- a/src/simulation/m_riemann_solver_hllc.fpp +++ b/src/simulation/m_riemann_solver_hllc.fpp @@ -111,14 +111,15 @@ contains #:else real(wp), dimension(num_dims) :: xi_field_L, xi_field_R #:endif - real(wp) :: G_L, G_R - real(wp) :: vel_L_rms, vel_R_rms, vel_avg_rms - real(wp) :: vel_L_tmp, vel_R_tmp - real(wp) :: rho_Star, E_Star, p_Star, p_K_Star, vel_K_star - real(wp) :: pres_SL, pres_SR, Ms_L, Ms_R - real(wp) :: flux_ene_e - real(wp) :: zcoef, 
pcorr !< low Mach number correction - integer :: Re_max, i, j, k, l, q !< Generic loop iterators + real(wp) :: G_L, G_R + real(wp) :: vel_L_rms, vel_R_rms, vel_avg_rms + real(wp) :: vel_L_tmp, vel_R_tmp + real(wp) :: rho_Star, E_Star, p_Star, p_K_Star, vel_K_star + real(wp) :: pres_SL, pres_SR, Ms_L, Ms_R + real(wp) :: flux_ene_e + real(wp) :: zcoef, pcorr !< low Mach number correction + integer :: Re_max, i, j, k, l, q !< Generic loop iterators + integer, dimension(2) :: Re_size_loc !< host copy of Re_size; amdflang reads the declare-target original stale cross-TU ! Populating the buffers of the left and right Riemann problem states variables, based on the choice of boundary conditions call s_populate_riemann_states_variables_buffers(qL_prim_rsx_vf, dqL_prim_dx_vf, dqL_prim_dy_vf, dqL_prim_dz_vf, & @@ -127,6 +128,7 @@ contains ! Reshaping inputted data based on dimensional splitting direction call s_initialize_riemann_solver(flux_src_vf, norm_dir) + Re_size_loc = Re_size #:for NORM_DIR, XYZ, STENCIL_VAR, COORDS, X_BND, Y_BND, Z_BND in & [(1, 'x', 'j', '{STENCIL_IDX}, k, l', 'is1', 'is2', 'is3'), & @@ -147,7 +149,7 @@ contains & rho_avg, H_avg, c_avg, gamma_avg, ptilde_L, ptilde_R, vel_L_rms, vel_R_rms, & & vel_avg_rms, vel_L_tmp, vel_R_tmp, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, & & alpha_R_sum, rho_Star, E_Star, p_Star, p_K_Star, vel_K_star, s_L, s_R, s_M, s_P, s_S, & - & xi_M, xi_P, xi_L, xi_R, xi_L_m1, xi_R_m1, xi_MP, xi_PP]') + & xi_M, xi_P, xi_L, xi_R, xi_L_m1, xi_R_m1, xi_MP, xi_PP]', firstprivate='[Re_size_loc]') do l = ${Z_BND}$%beg, ${Z_BND}$%end do k = ${Y_BND}$%beg, ${Y_BND}$%end do j = ${X_BND}$%beg, ${X_BND}$%end @@ -229,10 +231,10 @@ contains do i = 1, 2 Re_L(i) = dflt_real Re_R(i) = dflt_real - if (Re_size(i) > 0) Re_L(i) = 0._wp - if (Re_size(i) > 0) Re_R(i) = 0._wp + if (Re_size_loc(i) > 0) Re_L(i) = 0._wp + if (Re_size_loc(i) > 0) Re_R(i) = 0._wp $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) + do q = 1, Re_size_loc(i) Re_L(i) = 
qL_prim_rsx_vf(${SF('')}$, eqn_idx%E + Re_idx(i, q))/Res_gs(i, q) + Re_L(i) Re_R(i) = qR_prim_rsx_vf(${SF(' + 1')}$, eqn_idx%E + Re_idx(i, q))/Res_gs(i, & & q) + Re_R(i) @@ -782,7 +784,7 @@ contains & Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, s_L, s_R, s_M, s_P, s_S, xi_M, & & xi_P, xi_L, xi_R, xi_L_m1, xi_R_m1, xi_MP, xi_PP, nbub_L, nbub_R, PbwR3Lbar, PbwR3Rbar, & & R3Lbar, R3Rbar, R3V2Lbar, R3V2Rbar, Ys_L, Ys_R, Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, & - & Gamma_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2]') + & Gamma_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2]', firstprivate='[Re_size_loc]') do l = ${Z_BND}$%beg, ${Z_BND}$%end do k = ${Y_BND}$%beg, ${Y_BND}$%end do j = ${X_BND}$%beg, ${X_BND}$%end @@ -851,11 +853,11 @@ contains Re_L(i) = dflt_real Re_R(i) = dflt_real - if (Re_size(i) > 0) Re_L(i) = 0._wp - if (Re_size(i) > 0) Re_R(i) = 0._wp + if (Re_size_loc(i) > 0) Re_L(i) = 0._wp + if (Re_size_loc(i) > 0) Re_R(i) = 0._wp $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) + do q = 1, Re_size_loc(i) Re_L(i) = (1._wp - qL_prim_rsx_vf(${SF('')}$, eqn_idx%E + Re_idx(i, & & q)))/Res_gs(i, q) + Re_L(i) Re_R(i) = (1._wp - qR_prim_rsx_vf(${SF(' + 1')}$, eqn_idx%E + Re_idx(i, & @@ -1179,7 +1181,7 @@ contains & vel_R, Re_L, Re_R, alpha_L, alpha_R, s_L, s_R, s_S, vel_avg_rms, pcorr, zcoef, & & vel_L_tmp, vel_R_tmp, Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, & & tau_e_L, tau_e_R, xi_field_L, xi_field_R, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, G_L, & - & G_R]', copyin='[is1, is2, is3]') + & G_R, c_sum_Yi_Phi, flux_ene_e]', copyin='[is1, is2, is3]', firstprivate='[Re_size_loc]') do l = ${Z_BND}$%beg, ${Z_BND}$%end do k = ${Y_BND}$%beg, ${Y_BND}$%end do j = ${X_BND}$%beg, ${X_BND}$%end @@ -1244,8 +1246,8 @@ contains end do Re_max = 0 - if (Re_size(1) > 0) Re_max = 1 - if (Re_size(2) > 0) Re_max = 2 + if (Re_size_loc(1) > 0) Re_max = 1 + if (Re_size_loc(2) > 0) Re_max = 2 if (viscous) then $:GPU_LOOP(parallelism='[seq]') @@ -1254,7 +1256,7 @@ contains 
Re_R(i) = 0._wp $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) + do q = 1, Re_size_loc(i) Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) + Re_L(i) Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) + Re_R(i) end do diff --git a/src/simulation/m_riemann_solver_lf.fpp b/src/simulation/m_riemann_solver_lf.fpp index c3c3aaa959..76cb017c59 100644 --- a/src/simulation/m_riemann_solver_lf.fpp +++ b/src/simulation/m_riemann_solver_lf.fpp @@ -88,15 +88,16 @@ contains real(wp) :: vel_L_tmp, vel_R_tmp real(wp) :: Ms_L, Ms_R, pres_SL, pres_SR real(wp) :: alpha_L_sum, alpha_R_sum - real(wp) :: zcoef, pcorr !< low Mach number correction + real(wp) :: zcoef, pcorr !< low Mach number correction type(riemann_states) :: c_fast, pres_mag type(riemann_states_vec3) :: B - type(riemann_states) :: Ga !< Gamma (Lorentz factor) + type(riemann_states) :: Ga !< Gamma (Lorentz factor) type(riemann_states) :: vdotB, B2 - type(riemann_states_vec3) :: b4 !< 4-magnetic field components (spatial: b4x, b4y, b4z) - type(riemann_states_vec3) :: cm !< Conservative momentum variables - integer :: i, j, k, l, q !< Generic loop iterators + type(riemann_states_vec3) :: b4 !< 4-magnetic field components (spatial: b4x, b4y, b4z) + type(riemann_states_vec3) :: cm !< Conservative momentum variables + integer :: i, j, k, l, q !< Generic loop iterators integer, dimension(3) :: idx_right_phys !< Physical (j,k,l) indices for right state. + integer, dimension(2) :: Re_size_loc !< host copy of Re_size; amdflang reads the declare-target original stale cross-TU ! Populating the buffers of the left and right Riemann problem states variables, based on the choice of boundary conditions call s_populate_riemann_states_variables_buffers(qL_prim_rsx_vf, dqL_prim_dx_vf, dqL_prim_dy_vf, dqL_prim_dz_vf, & @@ -104,6 +105,7 @@ contains ! 
Reshaping inputted data based on dimensional splitting direction call s_initialize_riemann_solver(flux_src_vf, norm_dir) + Re_size_loc = Re_size #:for NORM_DIR, XYZ, STENCIL_VAR, COORDS, X_BND, Y_BND, Z_BND in & [(1, 'x', 'j', '{STENCIL_IDX}, k, l', 'is1', 'is2', 'is3'), & (2, 'y', 'k', 'j, {STENCIL_IDX}, l', 'is2', 'is1', 'is3'), & @@ -119,7 +121,8 @@ contains & vel_R_tmp, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, c_avg, pres_L, pres_R, & & rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, c_L, c_R, E_L, E_R, H_L, & & H_R, ptilde_L, ptilde_R, s_M, s_P, xi_M, xi_P, Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi, & - & Cp_L, Cp_R, Cv_L, Cv_R, R_gas_L, R_gas_R, MW_L, MW_R, T_L, T_R, Y_L, Y_R]') + & Cp_L, Cp_R, Cv_L, Cv_R, R_gas_L, R_gas_R, MW_L, MW_R, T_L, T_R, Y_L, Y_R, Gamm_L, Gamm_R, & + & flux_tau_L, flux_tau_R]', firstprivate='[Re_size_loc]') do l = ${Z_BND}$%beg, ${Z_BND}$%end do k = ${Y_BND}$%beg, ${Y_BND}$%end do j = ${X_BND}$%beg, ${X_BND}$%end @@ -216,11 +219,11 @@ contains Re_L(i) = dflt_real Re_R(i) = dflt_real - if (Re_size(i) > 0) Re_L(i) = 0._wp - if (Re_size(i) > 0) Re_R(i) = 0._wp + if (Re_size_loc(i) > 0) Re_L(i) = 0._wp + if (Re_size_loc(i) > 0) Re_R(i) = 0._wp $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) + do q = 1, Re_size_loc(i) Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) + Re_L(i) Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) + Re_R(i) end do @@ -599,7 +602,7 @@ contains if (viscous) then $:GPU_PARALLEL_LOOP(collapse=3, private='[i, j, k, l, idx_right_phys, vel_grad_L, vel_grad_R, alpha_L, alpha_R, & - & vel_L, vel_R, Re_L, Re_R]', copyin='[norm_dir]') + & vel_L, vel_R, Re_L, Re_R]', copyin='[norm_dir]', firstprivate='[Re_size_loc]') do l = isz%beg, isz%end do k = isy%beg, isy%end do j = isx%beg, isx%end @@ -650,11 +653,11 @@ contains Re_L(i) = dflt_real Re_R(i) = dflt_real - if (Re_size(i) > 0) Re_L(i) = 0._wp - if (Re_size(i) > 0) Re_R(i) = 0._wp + if (Re_size_loc(i) > 0) Re_L(i) = 0._wp + if (Re_size_loc(i) > 
0) Re_R(i) = 0._wp $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) + do q = 1, Re_size_loc(i) Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) + Re_L(i) Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) + Re_R(i) end do From a462123281c2476b78444219ac9826038a0296ee Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 13 Jun 2026 23:50:27 -0400 Subject: [PATCH 2/3] macros: fold long GPU directives across continuation lines for nvfortran The host-capture firstprivate(Re_size_loc) added to the LF/HLL/HLLC solvers (the AMD-flang viscosity fix) pushed the longest GPU_PARALLEL_LOOP directive past nvfortran's ~1000-char source-line limit (1011 chars -> 'source line too long' / unbalanced parentheses; AMD and Cray accept the long line, nvfortran caps it). FOLD_DIRECTIVE wraps the assembled directive at whole-clause boundaries with repeated sentinels (!$acc&/!$omp&), so the longest emitted line is prefix+longest-single-clause (817 chars on LF) -- shorter than the single line a build with one fewer clause already compiles. firstprivate is preserved (AMD/Cray correctness) on its own continuation. Validated: nvfortran 25.5 -acc and -mp=gpu compile+run the folded marker-interleaved continuation correctly; the cpp line-markers between continuations are already emitted by master for regular Fortran continuations and compile on Cray/AMD CI. Short directives (<200 chars) are unchanged. 
--- src/common/include/acc_macros.fpp | 8 +++--- src/common/include/omp_macros.fpp | 4 +-- src/common/include/shared_parallel_macros.fpp | 25 +++++++++++++++++++ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/common/include/acc_macros.fpp b/src/common/include/acc_macros.fpp index bd4284c01b..b91138f3b9 100644 --- a/src/common/include/acc_macros.fpp +++ b/src/common/include/acc_macros.fpp @@ -121,8 +121,8 @@ & copyout_val.strip('\n') + create_val.strip('\n') + & & no_create_val.strip('\n') + present_val.strip('\n') + & & deviceptr_val.strip('\n') + attach_val.strip('\n') - #:set acc_directive = '!$acc parallel ' + & - & acc_clause_val + extraAccArgs_val.strip('\n') + #:set acc_directive = FOLD_DIRECTIVE('!$acc parallel ' + & + & acc_clause_val + extraAccArgs_val.strip('\n'), '!$acc').strip('\n') #:set end_acc_directive = '!$acc end parallel' $:acc_directive $:code @@ -153,8 +153,8 @@ & copyout_val.strip('\n') + create_val.strip('\n') + & & no_create_val.strip('\n') + present_val.strip('\n') + & & deviceptr_val.strip('\n') + attach_val.strip('\n') - #:set acc_directive = '!$acc parallel loop ' + & - & clause_val + extraAccArgs_val.strip('\n') + #:set acc_directive = FOLD_DIRECTIVE('!$acc parallel loop ' + & + & clause_val + extraAccArgs_val.strip('\n'), '!$acc').strip('\n') $:acc_directive #:enddef diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp index 7620e7607f..cdf49db773 100644 --- a/src/common/include/omp_macros.fpp +++ b/src/common/include/omp_macros.fpp @@ -141,7 +141,7 @@ & deviceptr_val.strip('\n') + attach_val.strip('\n') #:set omp_clause_val = omp_clause_val.strip('\n') - #:set omp_directive = '!$omp target teams ' + omp_clause_val + extraOmpArgs_val.strip('\n') + #:set omp_directive = FOLD_DIRECTIVE('!$omp target teams ' + omp_clause_val + extraOmpArgs_val.strip('\n'), '!$omp').strip('\n') #:set omp_end_directive = '!$omp end target teams' $:omp_directive @@ -186,7 +186,7 @@ #:set omp_start_directive = 
'!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) ' #:endif - #:set omp_directive = omp_start_directive + clause_val + extraOmpArgs_val.strip('\n') + #:set omp_directive = FOLD_DIRECTIVE(omp_start_directive + clause_val + extraOmpArgs_val.strip('\n'), '!$omp').strip('\n') $:omp_directive #:enddef diff --git a/src/common/include/shared_parallel_macros.fpp b/src/common/include/shared_parallel_macros.fpp index 36bee0a23a..59bd15b8c7 100644 --- a/src/common/include/shared_parallel_macros.fpp +++ b/src/common/include/shared_parallel_macros.fpp @@ -117,4 +117,29 @@ #:endif $:extraArgs_val #:enddef + +#:def FOLD_DIRECTIVE(directive, sentinel, width=200) + #! Fold a long GPU directive across free-form continuation lines so it stays + #! under nvfortran's ~1000-char source-line limit. Breaks only at whole-clause + #! boundaries (clause(...) groups and bare keywords), repeating the sentinel + #! (e.g. '!$acc&') on each continuation -- which fypp's --no-folding cannot do + #! because its generic folder omits the sentinel. Every emitted line is no + #! longer than the prefix plus the single longest clause, i.e. no longer than + #! the unfolded line a build with one fewer clause already compiles. + #:set _toks = re.findall(r'\w+\([^)]*\)|\S+', directive) + #:set _lines = [] + #:set _cur = '' + #:for _t in _toks + #:if _cur == '' + #:set _cur = _t + #:elif len(_cur) + 1 + len(_t) > width + #:set _lines = _lines + [_cur + ' &'] + #:set _cur = sentinel + '& ' + _t + #:else + #:set _cur = _cur + ' ' + _t + #:endif + #:endfor + #:set _lines = _lines + [_cur] + $:'\n'.join(_lines) +#:enddef ! New line at end of file is required for FYPP From ecf7f22e389b508dc826d9120832e3d4d288a64b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sun, 14 Jun 2026 00:36:05 -0400 Subject: [PATCH 3/3] macros: avoid '...' 
placeholder in FOLD_DIRECTIVE comment (source lint) --- src/common/include/shared_parallel_macros.fpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/include/shared_parallel_macros.fpp b/src/common/include/shared_parallel_macros.fpp index 59bd15b8c7..75d0098fc8 100644 --- a/src/common/include/shared_parallel_macros.fpp +++ b/src/common/include/shared_parallel_macros.fpp @@ -121,7 +121,7 @@ #:def FOLD_DIRECTIVE(directive, sentinel, width=200) #! Fold a long GPU directive across free-form continuation lines so it stays #! under nvfortran's ~1000-char source-line limit. Breaks only at whole-clause - #! boundaries (clause(...) groups and bare keywords), repeating the sentinel + #! boundaries (clause(args) groups and bare keywords), repeating the sentinel #! (e.g. '!$acc&') on each continuation -- which fypp's --no-folding cannot do #! because its generic folder omits the sentinel. Every emitted line is no #! longer than the prefix plus the single longest clause, i.e. no longer than