From 432ca03acd86afb23efa300d4aef2180e81e2e7a Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Nov 2025 08:55:39 +1100 Subject: [PATCH 1/7] port advect_x --- src/tracer/MOM_tracer_advect.F90 | 167 ++++++++++++++++++------------- 1 file changed, 96 insertions(+), 71 deletions(-) diff --git a/src/tracer/MOM_tracer_advect.F90 b/src/tracer/MOM_tracer_advect.F90 index a2d43e80a9..7fc932fe70 100644 --- a/src/tracer/MOM_tracer_advect.F90 +++ b/src/tracer/MOM_tracer_advect.F90 @@ -270,7 +270,12 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first ! for all the transport to happen. The sum over domore_k keeps the processors ! synchronized. This may not be very efficient, but it should be reliable. - !$OMP parallel default(shared) + !!$OMP parallel default(shared) + !$omp target enter data map(to: hprev, uhr, uh_neglect, domore_u, local_advect_scheme, Reg%Tr(:), OBC) + + do m = 1, ntr + !$omp target enter data map(to: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%advection_xy) + enddo if (x_first) then @@ -282,6 +287,14 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first local_advect_scheme) endif ; enddo + !$omp target exit data map(from: hprev) + + ! transfer Tr(m) members before releasing Tr array + !$ do m = 1, ntr + !$omp target exit data map(from: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%advection_xy) + !$ enddo + !$omp target exit data map(release: Reg%tr(:)) + !$OMP do ordered do k=1,nz ; if (domore_k(k) > 0) then ! Next, advect meridionally. @@ -290,7 +303,8 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first ! Update domore_k(k) for the next iteration domore_k(k) = 0 - do j=jsv-stencil,jev+stencil ; if (domore_u(j,k)) domore_k(k) = 1 ; enddo + do concurrent (j=jsv-stencil:jev+stencil) ; + if (domore_u(j,k)) domore_k(k) = 1 ; enddo do J=jsv-1,jev ; if (domore_v(J,k)) domore_k(k) = 1 ; enddo endif ; enddo @@ -313,13 +327,22 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first ! Update domore_k(k) for the next iteration domore_k(k) = 0 - do j=jsv,jev ; if (domore_u(j,k)) domore_k(k) = 1 ; enddo + do concurrent (j=jsv:jev) + if (domore_u(j,k)) domore_k(k) = 1 ; enddo do J=jsv-1,jev ; if (domore_v(J,k)) domore_k(k) = 1 ; enddo endif ; enddo + ! transfer Tr(m) members before releasing Tr array + !$ do m = 1, ntr + !$omp target exit data map(from: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%advection_xy) + !$ enddo + !$omp target exit data map(release: Reg%tr(:)) + + !$omp target exit data map(from: hprev) endif ! x_first - !$OMP end parallel + !$omp target exit data map(from: uhr) map(release: uh_neglect, domore_u, local_advect_scheme, OBC) + !!$OMP end parallel ! If the advection just isn't finishing after max_iter, move on. if (itt >= max_iter) then @@ -432,14 +455,18 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & tiny_h = tiny(min_h) h_neglect = GV%H_subroundoff - do I=is-1,ie ; CFL(I) = 0.0 ; enddo + ! do I=is-1,ie ; CFL(I) = 0.0 ; enddo + !$omp target enter data & + !$omp map(alloc: slope_x, T_tmp, uhh, CFL, hlst, Ihnew, do_i, flux_x) + !$omp target teams loop private(slope_x, T_tmp, uhh, CFL, hlst, Ihnew, Tp, dMx, dMn, m, i, n, & + !$omp ntr_id, hup, hlos, i_up, Tc, Tm, aL, aR, dA, mA, a6) do j=js,je ; if (domore_u(j,k)) then domore_u(j,k) = .false. ! Calculate the i-direction profiles (slopes) of each tracer that is being advected. if (usePLMslope) then - do m=1,ntr ; do i=is-stencil,ie+stencil + do concurrent (m=1:ntr, i=is-stencil:ie+stencil) !if (ABS(Tr(m)%t(i+1,j,k)-Tr(m)%t(i,j,k)) < & ! ABS(Tr(m)%t(i,j,k)-Tr(m)%t(i-1,j,k))) then ! maxslope = 4.0*(Tr(m)%t(i+1,j,k)-Tr(m)%t(i,j,k)) @@ -459,47 +486,43 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & dMn= Tc - min( Tp, Tc, Tm ) slope_x(i,m) = G%mask2dCu(I,j)*G%mask2dCu(I-1,j) * & sign( min(0.5*abs(Tp-Tm), 2.0*dMx, 2.0*dMn), Tp-Tm ) - enddo ; enddo + enddo endif ! usePLMslope ! make a copy of the tracers in case values need to be overridden for OBCs - do m = 1,ntr - do i=G%isd,G%ied - T_tmp(i,m) = Tr(m)%t(i,j,k) - enddo + do concurrent (m = 1:ntr, i=G%isd:G%ied) + T_tmp(i,m) = Tr(m)%t(i,j,k) enddo ! loop through open boundaries and recalculate flux terms if (associated(OBC)) then ; if (OBC%OBC_pe) then do n=1,OBC%number_of_segments - segment=>OBC%segment(n) - if (.not. associated(segment%tr_Reg)) cycle - if (segment%is_E_or_W) then - if (j>=segment%HI%jsd .and. j<=segment%HI%jed) then - I = segment%HI%IsdB - do m = 1,segment%tr_Reg%ntseg ! replace tracers with OBC values - ntr_id = segment%tr_reg%Tr(m)%ntr_index - if (allocated(segment%tr_Reg%Tr(m)%tres)) then - if (segment%direction == OBC_DIRECTION_W) then - T_tmp(i,ntr_id) = segment%tr_Reg%Tr(m)%tres(i,j,k) + ! segment=>OBC%segment(n) + if (.not. associated(OBC%segment(n)%tr_Reg)) cycle + if (OBC%segment(n)%is_E_or_W) then + if (j>=OBC%segment(n)%HI%jsd .and. j<=OBC%segment(n)%HI%jed) then + I = OBC%segment(n)%HI%IsdB + do concurrent (m = 1:OBC%segment(n)%tr_Reg%ntseg) ! replace tracers with OBC values + ntr_id = OBC%segment(n)%tr_reg%Tr(m)%ntr_index + if (allocated(OBC%segment(n)%tr_Reg%Tr(m)%tres)) then + if (OBC%segment(n)%direction == OBC_DIRECTION_W) then + T_tmp(i,ntr_id) = OBC%segment(n)%tr_Reg%Tr(m)%tres(i,j,k) else - T_tmp(i+1,ntr_id) = segment%tr_Reg%Tr(m)%tres(i,j,k) + T_tmp(i+1,ntr_id) = OBC%segment(n)%tr_Reg%Tr(m)%tres(i,j,k) endif else - if (segment%direction == OBC_DIRECTION_W) then - T_tmp(i,ntr_id) = segment%tr_Reg%Tr(m)%OBC_inflow_conc + if (OBC%segment(n)%direction == OBC_DIRECTION_W) then + T_tmp(i,ntr_id) = OBC%segment(n)%tr_Reg%Tr(m)%OBC_inflow_conc else - T_tmp(i+1,ntr_id) = segment%tr_Reg%Tr(m)%OBC_inflow_conc + T_tmp(i+1,ntr_id) = OBC%segment(n)%tr_Reg%Tr(m)%OBC_inflow_conc endif endif enddo - do m = 1,ntr ! Apply update tracer values for slope calculation - do i=segment%HI%IsdB-1,segment%HI%IsdB+1 - Tp = T_tmp(i+1,m) ; Tc = T_tmp(i,m) ; Tm = T_tmp(i-1,m) - dMx = max( Tp, Tc, Tm ) - Tc - dMn= Tc - min( Tp, Tc, Tm ) - slope_x(i,m) = G%mask2dCu(I,j)*G%mask2dCu(I-1,j) * & - sign( min(0.5*abs(Tp-Tm), 2.0*dMx, 2.0*dMn), Tp-Tm ) - enddo + do concurrent (m = 1:ntr, i=OBC%segment(n)%HI%IsdB-1:OBC%segment(n)%HI%IsdB+1) ! Apply update tracer values for slope calculation + Tp = T_tmp(i+1,m) ; Tc = T_tmp(i,m) ; Tm = T_tmp(i-1,m) + dMx = max( Tp, Tc, Tm ) - Tc + dMn= Tc - min( Tp, Tc, Tm ) + slope_x(i,m) = G%mask2dCu(I,j)*G%mask2dCu(I-1,j) * & + sign( min(0.5*abs(Tp-Tm), 2.0*dMx, 2.0*dMn), Tp-Tm ) enddo endif @@ -512,7 +535,7 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & ! the minimum of the remaining mass flux (uhr) and the half the mass ! in the cell plus whatever part of its half of the mass flux that ! the flux through the other side does not require. - do I=is-1,ie + do concurrent (I=is-1:ie) if ((uhr(I,j,k) == 0.0) .or. & ((uhr(I,j,k) < 0.0) .and. (hprev(i+1,j,k) <= tiny_h)) .or. & ((uhr(I,j,k) > 0.0) .and. (hprev(i,j,k) <= tiny_h)) ) then @@ -543,10 +566,10 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & endif enddo - do m=1,ntr + do concurrent (m=1:ntr) if ((advect_schemes(m) == ADVECT_PPM) .or. (advect_schemes(m) == ADVECT_PPMH3)) then - do I=is-1,ie + do concurrent (I=is-1:ie) ! centre cell depending on upstream direction if (uhh(I) >= 0.0) then i_up = i @@ -587,7 +610,7 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & endif enddo else ! PLM - do I=is-1,ie + do concurrent (I=is-1:ie) if (uhh(I) >= 0.0) then ! Indirect implementation of PLM !aL = Tr(m)%t(i,j,k) - 0.5 * slope_x(i,m) @@ -612,22 +635,22 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & if (associated(OBC)) then ; if (OBC%OBC_pe) then if (OBC%specified_u_BCs_exist_globally .or. OBC%open_u_BCs_exist_globally) then do n=1,OBC%number_of_segments - segment=>OBC%segment(n) - if (.not. associated(segment%tr_Reg)) cycle - if (segment%is_E_or_W) then - if (j>=segment%HI%jsd .and. j<=segment%HI%jed) then - I = segment%HI%IsdB + ! segment=>OBC%segment(n) + if (.not. associated(OBC%segment(n)%tr_Reg)) cycle + if (OBC%segment(n)%is_E_or_W) then + if (j>=OBC%segment(n)%HI%jsd .and. j<=OBC%segment(n)%HI%jed) then + I = OBC%segment(n)%HI%IsdB ! Tracer fluxes are set to prescribed values only for inflows from masked areas. ! Now changing to simply fixed inflows. - if ((uhr(I,j,k) > 0.0) .and. (segment%direction == OBC_DIRECTION_W) .or. & - (uhr(I,j,k) < 0.0) .and. (segment%direction == OBC_DIRECTION_E)) then + if ((uhr(I,j,k) > 0.0) .and. (OBC%segment(n)%direction == OBC_DIRECTION_W) .or. & + (uhr(I,j,k) < 0.0) .and. (OBC%segment(n)%direction == OBC_DIRECTION_E)) then uhh(I) = uhr(I,j,k) ! should the reservoir evolve for this case Kate ?? - Nope - do m=1,segment%tr_Reg%ntseg - ntr_id = segment%tr_reg%Tr(m)%ntr_index - if (allocated(segment%tr_Reg%Tr(m)%tres)) then - flux_x(I,j,ntr_id) = uhh(I)*segment%tr_Reg%Tr(m)%tres(I,j,k) - else ; flux_x(I,j,ntr_id) = uhh(I)*segment%tr_Reg%Tr(m)%OBC_inflow_conc ; endif + do concurrent (m=1:OBC%segment(n)%tr_Reg%ntseg) + ntr_id = OBC%segment(n)%tr_reg%Tr(m)%ntr_index + if (allocated(OBC%segment(n)%tr_Reg%Tr(m)%tres)) then + flux_x(I,j,ntr_id) = uhh(I)*OBC%segment(n)%tr_Reg%Tr(m)%tres(I,j,k) + else ; flux_x(I,j,ntr_id) = uhh(I)*OBC%segment(n)%tr_Reg%Tr(m)%OBC_inflow_conc ; endif enddo endif endif @@ -637,21 +660,21 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & if (OBC%open_u_BCs_exist_globally) then do n=1,OBC%number_of_segments - segment=>OBC%segment(n) - I = segment%HI%IsdB - if (segment%is_E_or_W .and. (j >= segment%HI%jsd .and. j<= segment%HI%jed)) then - if (segment%specified) cycle - if (.not. associated(segment%tr_Reg)) cycle + ! segment=>OBC%segment(n) + I = OBC%segment(n)%HI%IsdB + if (OBC%segment(n)%is_E_or_W .and. (j >= OBC%segment(n)%HI%jsd .and. j<= OBC%segment(n)%HI%jed)) then + if (OBC%segment(n)%specified) cycle + if (.not. associated(OBC%segment(n)%tr_Reg)) cycle ! Tracer fluxes are set to prescribed values only for inflows from masked areas. if ((uhr(I,j,k) > 0.0) .and. (G%mask2dT(i,j) < 0.5) .or. & (uhr(I,j,k) < 0.0) .and. (G%mask2dT(i+1,j) < 0.5)) then uhh(I) = uhr(I,j,k) - do m=1,segment%tr_Reg%ntseg - ntr_id = segment%tr_reg%Tr(m)%ntr_index - if (allocated(segment%tr_Reg%Tr(m)%tres)) then - flux_x(I,j,ntr_id) = uhh(I)*segment%tr_Reg%Tr(m)%tres(I,j,k) - else; flux_x(I,j,ntr_id) = uhh(I)*segment%tr_Reg%Tr(m)%OBC_inflow_conc; endif + do concurrent (m=1:OBC%segment(n)%tr_Reg%ntseg) + ntr_id = OBC%segment(n)%tr_reg%Tr(m)%ntr_index + if (allocated(OBC%segment(n)%tr_Reg%Tr(m)%tres)) then + flux_x(I,j,ntr_id) = uhh(I)*OBC%segment(n)%tr_Reg%Tr(m)%tres(I,j,k) + else; flux_x(I,j,ntr_id) = uhh(I)*OBC%segment(n)%tr_Reg%Tr(m)%OBC_inflow_conc; endif enddo endif endif @@ -661,11 +684,11 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & ! Calculate new tracer concentration in each cell after accounting ! for the i-direction fluxes. - do I=is-1,ie + do concurrent (I=is-1:ie) uhr(I,j,k) = uhr(I,j,k) - uhh(I) if (abs(uhr(I,j,k)) < uh_neglect(I,j)) uhr(I,j,k) = 0.0 enddo - do i=is,ie + do concurrent (i=is:ie) if ((uhh(I) /= 0.0) .or. (uhh(I-1) /= 0.0)) then do_i(i,j) = .true. hlst(i) = hprev(i,j,k) @@ -684,7 +707,7 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & if (associated(OBC)) then if ((.not.OBC%exterior_OBC_bug) .and. (OBC%OBC_pe)) then if (OBC%specified_u_BCs_exist_globally .or. OBC%open_u_BCs_exist_globally) then - do i=is,ie-1 + do concurrent (i=is:ie-1) if (OBC%segnum_u(I,j) > 0) do_i(i+1,j) = .false. ! OBC_DIRECTION_E if (OBC%segnum_u(I,j) < 0) do_i(i,j) = .false. ! OBC_DIRECTION_W enddo @@ -693,10 +716,10 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & endif ! update tracer concentration from i-flux and save some diagnostics - do m=1,ntr + do concurrent (m=1:ntr) ! update tracer - do i=is,ie + do concurrent (i=is:ie) if (do_i(i,j)) then if (Ihnew(i) > 0.0) then Tr(m)%t(i,j,k) = (Tr(m)%t(i,j,k) * hlst(i) - & @@ -706,14 +729,14 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & enddo ! diagnostics - if (associated(Tr(m)%ad_x)) then ; do I=is-1,ie ; if (do_i(i,j) .or. do_i(i+1,j)) then + if (associated(Tr(m)%ad_x)) then ; do concurrent (I=is-1:ie) ; if (do_i(i,j) .or. do_i(i+1,j)) then Tr(m)%ad_x(I,j,k) = Tr(m)%ad_x(I,j,k) + flux_x(I,j,m)*Idt endif ; enddo ; endif ! diagnose convergence of flux_x (do not use the Ihnew(i) part of the logic). ! division by areaT to get into W/m2 for heat and kg/(s*m2) for salt. if (associated(Tr(m)%advection_xy)) then - do i=is,ie ; if (do_i(i,j)) then + do concurrent (i=is:ie) ; if (do_i(i,j)) then Tr(m)%advection_xy(i,j,k) = Tr(m)%advection_xy(i,j,k) - & (flux_x(I,j,m) - flux_x(I-1,j,m)) * & Idt * G%IareaT(i,j) @@ -725,24 +748,26 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & endif ; enddo ! End of j-loop. ! Do user controlled underflow of the tracer concentrations. - do m=1,ntr ; if (Tr(m)%conc_underflow > 0.0) then - do j=js,je ; do i=is,ie + do concurrent (m=1:ntr) ; if (Tr(m)%conc_underflow > 0.0) then + do concurrent (j=js:je, i=is:ie) if (abs(Tr(m)%t(i,j,k)) < Tr(m)%conc_underflow) Tr(m)%t(i,j,k) = 0.0 - enddo ; enddo + enddo endif ; enddo ! compute ad2d_x diagnostic outside above j-loop so as to make the summation ordered when OMP is active. !$OMP ordered do m=1,ntr ; if (associated(Tr(m)%ad2d_x)) then - do j=js,je ; if (domore_u_initial(j,k)) then - do I=is-1,ie ; if (do_i(i,j) .or. do_i(i+1,j)) then + do concurrent (j=js:je) ; if (domore_u_initial(j,k)) then + do concurrent (I=is-1:ie) ; if (do_i(i,j) .or. do_i(i+1,j)) then Tr(m)%ad2d_x(I,j) = Tr(m)%ad2d_x(I,j) + flux_x(I,j,m)*Idt endif ; enddo endif ; enddo endif ; enddo ! End of m-loop. !$OMP end ordered + !$omp target exit data & + !$omp map(release: slope_x, T_tmp, uhh, CFL, hlst, Ihnew, do_i, flux_x) end subroutine advect_x !> This subroutine does 1-d flux-form advection using a monotonic piecewise From d42aa8de0500cefc52c0e1c1dfe0563a47d650a7 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 27 Mar 2026 15:54:00 +1100 Subject: [PATCH 2/7] add tmp domore_u_jk fixes issue where compiler couldn't figure out that domore_u(j,k) should be reduced on in a nested loop --- src/tracer/MOM_tracer_advect.F90 | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/tracer/MOM_tracer_advect.F90 b/src/tracer/MOM_tracer_advect.F90 index 7fc932fe70..2bec405432 100644 --- a/src/tracer/MOM_tracer_advect.F90 +++ b/src/tracer/MOM_tracer_advect.F90 @@ -433,7 +433,7 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & real :: mA ! Average of the reconstruction tracer edge values [conc] real :: a6 ! Curvature of the reconstruction tracer values [conc] logical :: do_i(SZI_(G),SZJ_(G)) ! If true, work on given points. - logical :: usePLMslope + logical :: usePLMslope, domore_u_jk integer :: i, j, m, n, i_up, stencil, ntr_id type(OBC_segment_type), pointer :: segment=>NULL() logical, dimension(SZJ_(G),SZK_(GV)) :: domore_u_initial @@ -460,9 +460,9 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & !$omp target enter data & !$omp map(alloc: slope_x, T_tmp, uhh, CFL, hlst, Ihnew, do_i, flux_x) !$omp target teams loop private(slope_x, T_tmp, uhh, CFL, hlst, Ihnew, Tp, dMx, dMn, m, i, n, & - !$omp ntr_id, hup, hlos, i_up, Tc, Tm, aL, aR, dA, mA, a6) + !$omp ntr_id, hup, hlos, i_up, Tc, Tm, aL, aR, dA, mA, a6, domore_u_jk) do j=js,je ; if (domore_u(j,k)) then - domore_u(j,k) = .false. + domore_u_jk = .false. ! Calculate the i-direction profiles (slopes) of each tracer that is being advected. if (usePLMslope) then @@ -535,7 +535,7 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & ! the minimum of the remaining mass flux (uhr) and the half the mass ! in the cell plus whatever part of its half of the mass flux that ! the flux through the other side does not require. - do concurrent (I=is-1:ie) + do concurrent (I=is-1:ie) reduce(.or.:domore_u_jk) if ((uhr(I,j,k) == 0.0) .or. & ((uhr(I,j,k) < 0.0) .and. (hprev(i+1,j,k) <= tiny_h)) .or. & ((uhr(I,j,k) > 0.0) .and. (hprev(i,j,k) <= tiny_h)) ) then @@ -547,7 +547,7 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & if ((((hup - hlos) + uhr(I,j,k)) < 0.0) .and. & ((0.5*hup + uhr(I,j,k)) < 0.0)) then uhh(I) = MIN(-0.5*hup, -hup+hlos, 0.0) - domore_u(j,k) = .true. + domore_u_jk = .true. else uhh(I) = uhr(I,j,k) endif @@ -558,13 +558,15 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & if ((((hup - hlos) - uhr(I,j,k)) < 0.0) .and. & ((0.5*hup - uhr(I,j,k)) < 0.0)) then uhh(I) = MAX(0.5*hup, hup-hlos, 0.0) - domore_u(j,k) = .true. + domore_u_jk = .true. else uhh(I) = uhr(I,j,k) endif CFL(I) = uhh(I) / (hprev(i,j,k)) ! CFL is positive endif enddo + + domore_u(j,k) = domore_u_jk do concurrent (m=1:ntr) From 86ffae6f71364a99babb964928def4667c516d07 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 18 Nov 2025 13:49:33 +1100 Subject: [PATCH 3/7] port advect_y --- src/tracer/MOM_tracer_advect.F90 | 164 +++++++++++++++---------------- 1 file changed, 80 insertions(+), 84 deletions(-) diff --git a/src/tracer/MOM_tracer_advect.F90 b/src/tracer/MOM_tracer_advect.F90 index 2bec405432..403a0aa94b 100644 --- a/src/tracer/MOM_tracer_advect.F90 +++ b/src/tracer/MOM_tracer_advect.F90 @@ -271,11 +271,11 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first ! synchronized. This may not be very efficient, but it should be reliable. !!$OMP parallel default(shared) - !$omp target enter data map(to: hprev, uhr, uh_neglect, domore_u, local_advect_scheme, Reg%Tr(:), OBC) + !$omp target enter data map(to: hprev, uhr, vhr, uh_neglect, vh_neglect, domore_u, domore_v, local_advect_scheme, Reg%Tr(:), OBC) - do m = 1, ntr - !$omp target enter data map(to: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%advection_xy) - enddo + !$ do m = 1, ntr + !$omp target enter data map(to: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%ad_y, Reg%Tr(m)%ad2d_y, Reg%Tr(m)%advection_xy) + !$ enddo if (x_first) then @@ -287,14 +287,6 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first local_advect_scheme) endif ; enddo - !$omp target exit data map(from: hprev) - - ! transfer Tr(m) members before releasing Tr array - !$ do m = 1, ntr - !$omp target exit data map(from: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%advection_xy) - !$ enddo - !$omp target exit data map(release: Reg%tr(:)) - !$OMP do ordered do k=1,nz ; if (domore_k(k) > 0) then ! Next, advect meridionally. @@ -303,9 +295,8 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first ! Update domore_k(k) for the next iteration domore_k(k) = 0 - do concurrent (j=jsv-stencil:jev+stencil) ; - if (domore_u(j,k)) domore_k(k) = 1 ; enddo - do J=jsv-1,jev ; if (domore_v(J,k)) domore_k(k) = 1 ; enddo + do concurrent (j=jsv-stencil:jev+stencil, domore_u(j,k)) ; domore_k(k) = 1 ; enddo + do concurrent (J=jsv-1:jev, domore_v(J,k)) ; domore_k(k) = 1 ; enddo endif ; enddo @@ -327,21 +318,18 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first ! Update domore_k(k) for the next iteration domore_k(k) = 0 - do concurrent (j=jsv:jev) - if (domore_u(j,k)) domore_k(k) = 1 ; enddo - do J=jsv-1,jev ; if (domore_v(J,k)) domore_k(k) = 1 ; enddo + do concurrent (j=jsv:jev, domore_u(j,k)) ; domore_k(k) = 1 ; enddo + do concurrent (J=jsv-1:jev, domore_v(J,k)) ; domore_k(k) = 1 ; enddo endif ; enddo - ! transfer Tr(m) members before releasing Tr array - !$ do m = 1, ntr - !$omp target exit data map(from: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%advection_xy) - !$ enddo - !$omp target exit data map(release: Reg%tr(:)) - - !$omp target exit data map(from: hprev) endif ! x_first - !$omp target exit data map(from: uhr) map(release: uh_neglect, domore_u, local_advect_scheme, OBC) + ! transfer Tr(m) members before releasing Tr array + !$ do m = 1, ntr + !$omp target exit data map(from: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%ad_y, Reg%Tr(m)%ad2d_y, Reg%Tr(m)%advection_xy) + !$ enddo + + !$omp target exit data map(from: uhr, vhr, hprev) map(release: uh_neglect, vh_neglect, domore_u, domore_v, local_advect_scheme, Reg%tr(:), OBC) !!$OMP end parallel ! If the advection just isn't finishing after max_iter, move on. @@ -848,6 +836,8 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & tiny_h = tiny(min_h) h_neglect = GV%H_subroundoff + !$omp target enter data map(alloc: vhh, T_tmp, slope_y, flux_y, domore_v_initial, do_j_tr, do_i) + ! We conditionally perform work on tracer points: calculating the PLM slope, ! and updating tracer concentration within a cell ! this depends on whether there is a flux which would affect this tracer point, @@ -858,16 +848,20 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & ! Note: this does lead to unnecessary work in updating tracer concentrations, ! since that doesn't need a wider stencil with the PPM advection scheme, but ! this would require an additional loop, etc. - do_j_tr(:) = .false. - do J=js-1,je - if (domore_v(J,k)) then ; do j2=1-stencil,stencil ; do_j_tr(j+j2) = .true. ; enddo ; endif + do concurrent (j=SZJ_(G)) + do_j_tr(j) = .false. + enddo + do concurrent (J=js-1:je, domore_v(J,k)) + do concurrent (j2=1-stencil:stencil) ; do_j_tr(j+j2) = .true. ; enddo + enddo + do concurrent (j=SZJB_(G)) + domore_v_initial(j) = domore_v(j,k) enddo - domore_v_initial(:) = domore_v(:,k) ! Calculate the j-direction profiles (slopes) of each tracer that ! is being advected. if (usePLMslope) then - do j=js-stencil,je+stencil ; if (do_j_tr(j)) then ; do m=1,ntr ; do i=is,ie + do concurrent (j=js-stencil:je+stencil, do_j_tr(j)) ; do concurrent (m=1:ntr, i=is:ie) !if (ABS(Tr(m)%t(i,j+1,k)-Tr(m)%t(i,j,k)) < & ! ABS(Tr(m)%t(i,j,k)-Tr(m)%t(i,j-1,k))) then ! maxslope = 4.0*(Tr(m)%t(i,j+1,k)-Tr(m)%t(i,j,k)) @@ -887,15 +881,15 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & dMn = Tc - min( Tp, Tc, Tm ) slope_y(i,m,j) = G%mask2dCv(i,J)*G%mask2dCv(i,J-1) * & sign( min(0.5*abs(Tp-Tm), 2.0*dMx, 2.0*dMn), Tp-Tm ) - enddo ; enddo ; endif ; enddo ! End of i-, m-, & j- loops. + enddo ; enddo ! End of i-, m-, & j- loops. endif ! usePLMslope ! make a copy of the tracers in case values need to be overridden for OBCs - do j=G%jsd,G%jed ; do m=1,ntr ; do i=G%isd,G%ied + do concurrent (j=G%jsd:G%jed, m=1:ntr, i=G%isd:G%ied) T_tmp(i,m,j) = Tr(m)%t(i,j,k) - enddo ; enddo ; enddo + enddo ! loop through open boundaries and recalculate flux terms if (associated(OBC)) then ; if (OBC%OBC_pe) then @@ -941,10 +935,12 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & ! the minimum of the remaining mass flux (vhr) and the half the mass ! in the cell plus whatever part of its half of the mass flux that ! the flux through the other side does not require. + !$omp target teams loop & + !$omp private(CFL, Ihnew, hup, hlos, j_up, Tp, Tc, Tm, i, m, aL, aR, dA, mA, a6, n, ntr_id) do J=js-1,je ; if (domore_v(J,k)) then domore_v(J,k) = .false. - do i=is,ie + do concurrent (i=is:ie) if ((vhr(i,J,k) == 0.0) .or. & ((vhr(i,J,k) < 0.0) .and. (hprev(i,j+1,k) <= tiny_h)) .or. & ((vhr(i,J,k) > 0.0) .and. (hprev(i,j,k) <= tiny_h)) ) then @@ -975,10 +971,10 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & endif enddo - do m=1,ntr + do concurrent (m=1:ntr) if ((advect_schemes(m) == ADVECT_PPM) .or. (advect_schemes(m) == ADVECT_PPMH3)) then - do i=is,ie + do concurrent (i=is:ie) ! centre cell depending on upstream direction if (vhh(i,J) >= 0.0) then j_up = j @@ -1019,7 +1015,7 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & endif enddo else ! PLM - do i=is,ie + do concurrent (i=is:ie) if (vhh(i,J) >= 0.0) then ! Indirect implementation of PLM !aL = Tr(m)%t(i,j,k) - 0.5 * slope_y(i,m,j) @@ -1044,20 +1040,19 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & if (associated(OBC)) then ; if (OBC%OBC_pe) then if (OBC%specified_v_BCs_exist_globally .or. OBC%open_v_BCs_exist_globally) then do n=1,OBC%number_of_segments - segment=>OBC%segment(n) - if (.not. segment%specified) cycle - if (.not. associated(segment%tr_Reg)) cycle + if (.not. OBC%segment(n)%specified) cycle + if (.not. associated(OBC%segment(n)%tr_Reg)) cycle if (OBC%segment(n)%is_N_or_S) then - if (J >= segment%HI%JsdB .and. J<= segment%HI%JedB) then - do i=segment%HI%isd,segment%HI%ied + if (J >= OBC%segment(n)%HI%JsdB .and. J<= OBC%segment(n)%HI%JedB) then + do i=OBC%segment(n)%HI%isd,OBC%segment(n)%HI%ied ! Tracer fluxes are set to prescribed values only for inflows from masked areas. ! Now changing to simply fixed inflows. - if ((vhr(i,J,k) > 0.0) .and. (segment%direction == OBC_DIRECTION_S) .or. & - (vhr(i,J,k) < 0.0) .and. (segment%direction == OBC_DIRECTION_N)) then + if ((vhr(i,J,k) > 0.0) .and. (OBC%segment(n)%direction == OBC_DIRECTION_S) .or. & + (vhr(i,J,k) < 0.0) .and. (OBC%segment(n)%direction == OBC_DIRECTION_N)) then vhh(i,J) = vhr(i,J,k) - do m=1,segment%tr_Reg%ntseg - ntr_id = segment%tr_reg%Tr(m)%ntr_index - if (allocated(segment%tr_Reg%Tr(m)%tres)) then + do m=1,OBC%segment(n)%tr_Reg%ntseg + ntr_id = OBC%segment(n)%tr_reg%Tr(m)%ntr_index + if (allocated(OBC%segment(n)%tr_Reg%Tr(m)%tres)) then flux_y(i,ntr_id,J) = vhh(i,J)*OBC%segment(n)%tr_Reg%Tr(m)%tres(i,J,k) else flux_y(i,ntr_id,J) = vhh(i,J)*OBC%segment(n)%tr_Reg%Tr(m)%OBC_inflow_conc @@ -1072,20 +1067,19 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & if (OBC%open_v_BCs_exist_globally) then do n=1,OBC%number_of_segments - segment=>OBC%segment(n) - if (segment%specified) cycle - if (.not. associated(segment%tr_Reg)) cycle - if (segment%is_N_or_S .and. (J >= segment%HI%JsdB .and. J<= segment%HI%JedB)) then - do i=segment%HI%isd,segment%HI%ied + if (OBC%segment(n)%specified) cycle + if (.not. associated(OBC%segment(n)%tr_Reg)) cycle + if (OBC%segment(n)%is_N_or_S .and. (J >= OBC%segment(n)%HI%JsdB .and. J<= OBC%segment(n)%HI%JedB)) then + do i=OBC%segment(n)%HI%isd,OBC%segment(n)%HI%ied ! Tracer fluxes are set to prescribed values only for inflows from masked areas. if ((vhr(i,J,k) > 0.0) .and. (G%mask2dT(i,j) < 0.5) .or. & (vhr(i,J,k) < 0.0) .and. (G%mask2dT(i,j+1) < 0.5)) then vhh(i,J) = vhr(i,J,k) - do m=1,segment%tr_Reg%ntseg - ntr_id = segment%tr_reg%Tr(m)%ntr_index - if (allocated(segment%tr_Reg%Tr(m)%tres)) then - flux_y(i,ntr_id,J) = vhh(i,J)*segment%tr_Reg%Tr(m)%tres(i,J,k) - else ; flux_y(i,ntr_id,J) = vhh(i,J)*segment%tr_Reg%Tr(m)%OBC_inflow_conc ; endif + do m=1,OBC%segment(n)%tr_Reg%ntseg + ntr_id = OBC%segment(n)%tr_reg%Tr(m)%ntr_index + if (allocated(OBC%segment(n)%tr_Reg%Tr(m)%tres)) then + flux_y(i,ntr_id,J) = vhh(i,J)*OBC%segment(n)%tr_Reg%Tr(m)%tres(i,J,k) + else ; flux_y(i,ntr_id,J) = vhh(i,J)*OBC%segment(n)%tr_Reg%Tr(m)%OBC_inflow_conc ; endif enddo endif enddo @@ -1095,19 +1089,20 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & endif ; endif else ! not domore_v. - do i=is,ie ; vhh(i,J) = 0.0 ; enddo - do m=1,ntr ; do i=is,ie ; flux_y(i,m,J) = 0.0 ; enddo ; enddo + do concurrent (i=is:ie) ; vhh(i,J) = 0.0 ; enddo + do concurrent (m=1:ntr, i=is:ie) ; flux_y(i,m,J) = 0.0 ; enddo endif ; enddo ! End of j-loop - do J=js-1,je ; do i=is,ie + do concurrent (J=js-1:je, i=is:ie) vhr(i,J,k) = vhr(i,J,k) - vhh(i,J) if (abs(vhr(i,J,k)) < vh_neglect(i,J)) vhr(i,J,k) = 0.0 - enddo ; enddo + enddo ! Calculate new tracer concentration in each cell after accounting ! for the j-direction fluxes. + !$omp target teams loop private(hlst, Ihnew, i, m) do j=js,je ; if (do_j_tr(j)) then - do i=is,ie + do concurrent (i=is:ie) if ((vhh(i,J) /= 0.0) .or. (vhh(i,J-1) /= 0.0)) then do_i(i,j) = .true. hlst(i) = hprev(i,j,k) @@ -1124,60 +1119,61 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & if (associated(OBC)) then if ((OBC%exterior_OBC_bug .eqv. .false.) .and. (OBC%OBC_pe)) then if (OBC%specified_v_BCs_exist_globally .or. OBC%open_v_BCs_exist_globally) then - do i=is,ie - if (OBC%segnum_v(i,J-1) > 0) do_i(i,j) = .false. ! OBC_DIRECTION_N - if (OBC%segnum_v(i,J) < 0) do_i(i,j) = .false. ! OBC_DIRECTION_S + do concurrent (i=is:ie, OBC%segnum_v(i,J-1) > 0 .or. OBC%segnum_v(i,J) < 0) ! OBC_DIRECTION_N or OBC_DIRECTION_S + do_i(i,j) = .false. enddo endif endif endif ! update tracer and save some diagnostics - do m=1,ntr - do i=is,ie ; if (do_i(i,j)) then + do concurrent (m=1:ntr) + do concurrent (i=is:ie, do_i(i,j)) Tr(m)%t(i,j,k) = (Tr(m)%t(i,j,k) * hlst(i) - & (flux_y(i,m,J) - flux_y(i,m,J-1))) * Ihnew(i) - endif ; enddo + enddo ! diagnose convergence of flux_y and add to convergence of flux_x. ! division by areaT to get into W/m2 for heat and kg/(s*m2) for salt. if (associated(Tr(m)%advection_xy)) then - do i=is,ie ; if (do_i(i,j)) then + do concurrent (i=is:ie, do_i(i,j)) Tr(m)%advection_xy(i,j,k) = Tr(m)%advection_xy(i,j,k) - & (flux_y(i,m,J) - flux_y(i,m,J-1))* Idt * & G%IareaT(i,j) - endif ; enddo + enddo endif enddo endif ; enddo ! End of j-loop. ! Do user controlled underflow of the tracer concentrations. - do m=1,ntr ; if (Tr(m)%conc_underflow > 0.0) then - do j=js,je ; do i=is,ie - if (abs(Tr(m)%t(i,j,k)) < Tr(m)%conc_underflow) Tr(m)%t(i,j,k) = 0.0 - enddo ; enddo - endif ; enddo + do concurrent (m=1:ntr, Tr(m)%conc_underflow > 0.0) + do concurrent (j=js:je, i=is:ie, abs(Tr(m)%t(i,j,k)) < Tr(m)%conc_underflow) + Tr(m)%t(i,j,k) = 0.0 + enddo + enddo ! compute ad_y and ad2d_y diagnostic outside above j-loop so as to make the summation ordered when OMP is active. !$OMP ordered do m=1,ntr ; if (associated(Tr(m)%ad_y)) then - do J=js-1,je ; if (domore_v_initial(J)) then - do i=is,ie ; if (do_i(i,j) .or. do_i(i,j+1)) then + do concurrent (J=js-1:je, domore_v_initial(J)) + do concurrent (i=is:ie, do_i(i,j) .or. do_i(i,j+1)) Tr(m)%ad_y(i,J,k) = Tr(m)%ad_y(i,J,k) + flux_y(i,m,J)*Idt - endif ; enddo - endif ; enddo + enddo + enddo endif ; enddo ! End of m-loop. do m=1,ntr ; if (associated(Tr(m)%ad2d_y)) then - do J=js-1,je ; if (domore_v_initial(J)) then - do i=is,ie ; if (do_i(i,j) .or. do_i(i,j+1)) then + do concurrent (J=js-1:je, domore_v_initial(J)) + do concurrent (i=is:ie, do_i(i,j) .or. do_i(i,j+1)) Tr(m)%ad2d_y(i,J) = Tr(m)%ad2d_y(i,J) + flux_y(i,m,J)*Idt - endif ; enddo - endif ; enddo + enddo + enddo endif ; enddo ! End of m-loop. !$OMP end ordered + !$omp target exit data map(release: vhh, T_tmp, slope_y, flux_y, domore_v_initial, do_j_tr, do_i) + end subroutine advect_y !> Initialize lateral tracer advection module From 919a0d905708c3fc4b2607cd746b2c0c11d6ac3e Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 27 Mar 2026 16:02:43 +1100 Subject: [PATCH 4/7] add tmp domore_v_jk --- src/tracer/MOM_tracer_advect.F90 | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/tracer/MOM_tracer_advect.F90 b/src/tracer/MOM_tracer_advect.F90 index 403a0aa94b..cd5d991a6b 100644 --- a/src/tracer/MOM_tracer_advect.F90 +++ b/src/tracer/MOM_tracer_advect.F90 @@ -821,7 +821,7 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & logical :: usePLMslope integer :: i, j, j2, m, n, j_up, stencil, ntr_id type(OBC_segment_type), pointer :: segment=>NULL() - logical :: domore_v_initial(SZJB_(G)) ! Initial state of domore_v + logical :: domore_v_initial(SZJB_(G)), domore_v_jk ! Initial state of domore_v usePLMslope = .false. ! stencil for calculating slope values @@ -936,11 +936,12 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & ! in the cell plus whatever part of its half of the mass flux that ! the flux through the other side does not require. !$omp target teams loop & - !$omp private(CFL, Ihnew, hup, hlos, j_up, Tp, Tc, Tm, i, m, aL, aR, dA, mA, a6, n, ntr_id) + !$omp private(CFL, Ihnew, hup, hlos, j_up, Tp, Tc, Tm, i, m, aL, aR, dA, mA, a6, n, ntr_id, & + !$omp domore_v_jk) do J=js-1,je ; if (domore_v(J,k)) then - domore_v(J,k) = .false. + domore_v_jk = .false. - do concurrent (i=is:ie) + do concurrent (i=is:ie) reduce(.or.:domore_v_jk) if ((vhr(i,J,k) == 0.0) .or. & ((vhr(i,J,k) < 0.0) .and. (hprev(i,j+1,k) <= tiny_h)) .or. & ((vhr(i,J,k) > 0.0) .and. (hprev(i,j,k) <= tiny_h)) ) then @@ -952,7 +953,7 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & if ((((hup - hlos) + vhr(i,J,k)) < 0.0) .and. & ((0.5*hup + vhr(i,J,k)) < 0.0)) then vhh(i,J) = MIN(-0.5*hup, -hup+hlos, 0.0) - domore_v(J,k) = .true. + domore_v_jk = .true. else vhh(i,J) = vhr(i,J,k) endif @@ -963,7 +964,7 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & if ((((hup - hlos) - vhr(i,J,k)) < 0.0) .and. & ((0.5*hup - vhr(i,J,k)) < 0.0)) then vhh(i,J) = MAX(0.5*hup, hup-hlos, 0.0) - domore_v(J,k) = .true. + domore_v_jk = .true. else vhh(i,J) = vhr(i,J,k) endif @@ -971,6 +972,8 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & endif enddo + domore_v(j,k) = domore_v_jk + do concurrent (m=1:ntr) if ((advect_schemes(m) == ADVECT_PPM) .or. (advect_schemes(m) == ADVECT_PPMH3)) then From 014d951d4f25d7584cfd431b55d10a370ca57267 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 19 Nov 2025 08:54:22 +1100 Subject: [PATCH 5/7] port advect_tracer --- src/tracer/MOM_tracer_advect.F90 | 270 +++++++++++++++++++------------ 1 file changed, 163 insertions(+), 107 deletions(-) diff --git a/src/tracer/MOM_tracer_advect.F90 b/src/tracer/MOM_tracer_advect.F90 index cd5d991a6b..56dc00a288 100644 --- a/src/tracer/MOM_tracer_advect.F90 +++ b/src/tracer/MOM_tracer_advect.F90 @@ -114,8 +114,16 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first integer :: stencil_local ! Stencil for the local adection scheme integer :: local_advect_scheme(Reg%ntr) ! contains the list of the advection for each tracer - domore_u(:,:) = .false. - domore_v(:,:) = .false. + !$omp target update to(uhtr, vhtr, h_end) + + !$omp target enter data map(to: Reg, Reg%Tr(:), CS, OBC) map(alloc: domore_u, domore_v, uhr, vhr, uh_neglect, vh_neglect, hprev) + + do concurrent (k=1:GV%ke, j=G%jsd:G%jed) + domore_u(j,k) = .false. + enddo + do concurrent (k=1:GV%ke, j=G%jsdB:G%jedB) + domore_v(j,k) = .false. + enddo is = G%isc ; ie = G%iec ; js = G%jsc ; je = G%jec ; nz = GV%ke isd = G%isd ; ied = G%ied ; jsd = G%jsd ; jed = G%jed IsdB = G%IsdB ; IedB = G%IedB ; JsdB = G%JsdB ; JedB = G%JedB @@ -134,7 +142,7 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first x_first = (MOD(G%first_direction,2) == 0) ! Choose the maximum stencil from all the local advection scheme - do m = 1,ntr + do concurrent (m = 1:ntr) local_advect_scheme(m) = Reg%Tr(m)%advect_scheme if(local_advect_scheme(m) < 0) local_advect_scheme(m) = CS%default_advect_scheme @@ -167,27 +175,37 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first call create_group_pass(CS%pass_uhr_vhr_t_hprev, hprev, G%Domain) do m=1,ntr call create_group_pass(CS%pass_uhr_vhr_t_hprev, Reg%Tr(m)%t, G%Domain) + !$omp target enter data map(to: Reg%Tr(m)%t) map(alloc: Reg%Tr(m)%ad_x, Reg%Tr(m)%ad_y, Reg%Tr(m)%ad2d_y, Reg%Tr(m)%advection_xy) enddo call cpu_clock_end(id_clock_pass) - !$OMP parallel default(shared) + !$omp target enter data map(to: local_advect_scheme) ! This initializes the halos of uhr and vhr because pass_vector might do ! calculations on them, even though they are never used. - !$OMP do - do k=1,nz - do j=jsd,jed ; do I=IsdB,IedB ; uhr(I,j,k) = 0.0 ; enddo ; enddo - do J=jsdB,jedB ; do i=Isd,Ied ; vhr(i,J,k) = 0.0 ; enddo ; enddo - do j=jsd,jed ; do i=Isd,Ied ; hprev(i,j,k) = 0.0 ; enddo ; enddo + do concurrent (k=1:nz) + do concurrent (j=jsd:jed, I=IsdB:IedB) + uhr(I,j,k) = 0.0 + enddo + do concurrent (J=jsdB:jedB, i=Isd:Ied) + vhr(i,J,k) = 0.0 + enddo + do concurrent (j=jsd:jed, i=Isd:Ied) + hprev(i,j,k) = 0.0 + enddo domore_k(k)=1 ! Put the remaining (total) thickness fluxes into uhr and vhr. - do j=js,je ; do I=is-1,ie ; uhr(I,j,k) = uhtr(I,j,k) ; enddo ; enddo - do J=js-1,je ; do i=is,ie ; vhr(i,J,k) = vhtr(i,J,k) ; enddo ; enddo + do concurrent (j=js:je, I=is-1:ie) + uhr(I,j,k) = uhtr(I,j,k) + enddo + do concurrent (J=js-1:je, i=is:ie) + vhr(i,J,k) = vhtr(i,J,k) + enddo if (.not. present(vol_prev)) then ! This loop reconstructs the thickness field the last time that the ! tracers were updated, probably just after the diabatic forcing. A useful ! diagnostic could be to compare this reconstruction with that older value. - do j=js,je ; do i=is,ie + do concurrent (j=js:je, i=is:ie) hprev(i,j,k) = max(0.0, G%areaT(i,j)*h_end(i,j,k) + & ((uhr(I,j,k) - uhr(I-1,j,k)) + (vhr(i,J,k) - vhr(i,J-1,k)))) ! In the case that the layer is now dramatically thinner than it was previously, @@ -195,41 +213,56 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first ! non-conservation of tracers hprev(i,j,k) = hprev(i,j,k) + & max(0.0, 1.0e-13*hprev(i,j,k) - G%areaT(i,j)*h_end(i,j,k)) - enddo ; enddo + enddo else - do j=js,je ; do i=is,ie + do concurrent (j=js:je, i=is:ie) hprev(i,j,k) = vol_prev(i,j,k) - enddo ; enddo + enddo endif enddo - - !$OMP do - do j=jsd,jed ; do I=isd,ied-1 + do concurrent (j=jsd:jed, I=isd:ied-1) uh_neglect(I,j) = GV%H_subroundoff * MIN(G%areaT(i,j), G%areaT(i+1,j)) - enddo ; enddo - !$OMP do - do J=jsd,jed-1 ; do i=isd,ied + enddo + do concurrent (J=jsd:jed-1, i=isd:ied) vh_neglect(i,J) = GV%H_subroundoff * MIN(G%areaT(i,j), G%areaT(i,j+1)) - enddo ; enddo + enddo ! initialize diagnostic fluxes and tendencies - !$OMP do - do m=1,ntr - if (associated(Reg%Tr(m)%ad_x)) Reg%Tr(m)%ad_x(:,:,:) = 0.0 - if (associated(Reg%Tr(m)%ad_y)) Reg%Tr(m)%ad_y(:,:,:) = 0.0 - if (associated(Reg%Tr(m)%advection_xy)) Reg%Tr(m)%advection_xy(:,:,:) = 0.0 - if (associated(Reg%Tr(m)%ad2d_x)) Reg%Tr(m)%ad2d_x(:,:) = 0.0 - if (associated(Reg%Tr(m)%ad2d_y)) Reg%Tr(m)%ad2d_y(:,:) = 0.0 + do concurrent (m=1:ntr) + if (associated(Reg%Tr(m)%ad_x)) then + do concurrent (k=1:nz, j=jsd:jed, I=IsdB:IedB) + Reg%Tr(m)%ad_x(i,j,k) = 0.0 + enddo + endif + if (associated(Reg%Tr(m)%ad_y)) then + do concurrent (k=1:nz, j=JsdB:jedB, i=isd:ied) + Reg%Tr(m)%ad_y(i,j,k) = 0.0 + enddo + endif + if (associated(Reg%Tr(m)%advection_xy)) then + do concurrent (k=1:nz, j=jsd:jed, i=isd:ied) + Reg%Tr(m)%advection_xy(i,j,k) = 0.0 + enddo + endif + if (associated(Reg%Tr(m)%ad2d_x)) then + do concurrent (j=jsd:jed, I=IsdB:IedB) + Reg%Tr(m)%ad2d_x(i,j) = 0.0 + enddo + endif + if (associated(Reg%Tr(m)%ad2d_y)) then + do concurrent (J=JsdB:JedB, i=isd:ied) + Reg%Tr(m)%ad2d_y(i,j) = 0.0 + enddo + endif enddo - !$OMP end parallel isv = is ; iev = ie ; jsv = js ; jev = je do itt=1,max_iter if (isv > is-stencil) then - call do_group_pass(CS%pass_uhr_vhr_t_hprev, G%Domain, clock=id_clock_pass) + call do_group_pass(CS%pass_uhr_vhr_t_hprev, G%Domain, clock=id_clock_pass, omp_offload=.true.) nsten_halo = min(is-isd,ied-ie,js-jsd,jed-je)/stencil isv = is-nsten_halo*stencil ; jsv = js-nsten_halo*stencil @@ -237,26 +270,29 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first ! Reevaluate domore_u & domore_v unless the valid range is the same size as ! before. Also, do this if there is Strang splitting. if ((nsten_halo > 1) .or. (itt==1)) then - !$OMP parallel do default(shared) - do k=1,nz ; if (domore_k(k) > 0) then - do j=jsv,jev ; if (.not.domore_u(j,k)) then + do concurrent (k=1:nz, domore_k(k) > 0) + do concurrent (j=jsv:jev, .not.domore_u(j,k)) do i=isv+stencil-1,iev-stencil ; if (uhr(I,j,k) /= 0.0) then domore_u(j,k) = .true. ; exit endif ; enddo ! i-loop - endif ; enddo - do J=jsv+stencil-1,jev-stencil ; if (.not.domore_v(J,k)) then + enddo + do concurrent (J=jsv+stencil-1:jev-stencil, .not.domore_v(J,k)) do i=isv+stencil,iev-stencil ; if (vhr(i,J,k) /= 0.0) then domore_v(J,k) = .true. ; exit endif ; enddo ! i-loop - endif ; enddo + enddo ! At this point, domore_k is global. Change it so that it indicates ! whether any work is needed on a layer on this processor. domore_k(k) = 0 - do j=jsv,jev ; if (domore_u(j,k)) domore_k(k) = 1 ; enddo - do J=jsv+stencil-1,jev-stencil ; if (domore_v(J,k)) domore_k(k) = 1 ; enddo + do concurrent (j=jsv:jev, domore_u(j,k)) + domore_k(k) = 1 + enddo + do concurrent (J=jsv+stencil-1:jev-stencil, domore_v(J,k)) + domore_k(k) = 1 + enddo - endif ; enddo ! k-loop + enddo ! k-loop endif endif @@ -270,16 +306,8 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first ! for all the transport to happen. The sum over domore_k keeps the processors ! synchronized. This may not be very efficient, but it should be reliable. - !!$OMP parallel default(shared) - !$omp target enter data map(to: hprev, uhr, vhr, uh_neglect, vh_neglect, domore_u, domore_v, local_advect_scheme, Reg%Tr(:), OBC) - - !$ do m = 1, ntr - !$omp target enter data map(to: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%ad_y, Reg%Tr(m)%ad2d_y, Reg%Tr(m)%advection_xy) - !$ enddo - if (x_first) then - !$OMP do ordered do k=1,nz ; if (domore_k(k) > 0) then ! First, advect zonally. call advect_x(Reg%Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & @@ -287,22 +315,26 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first local_advect_scheme) endif ; enddo - !$OMP do ordered do k=1,nz ; if (domore_k(k) > 0) then ! Next, advect meridionally. call advect_y(Reg%Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & isv, iev, jsv, jev, k, G, GV, US, local_advect_scheme) ! Update domore_k(k) for the next iteration + !$omp target domore_k(k) = 0 - do concurrent (j=jsv-stencil:jev+stencil, domore_u(j,k)) ; domore_k(k) = 1 ; enddo - do concurrent (J=jsv-1:jev, domore_v(J,k)) ; domore_k(k) = 1 ; enddo + !$omp end target + do concurrent (j=jsv-stencil:jev+stencil, domore_u(j,k)) + domore_k(k) = 1 + enddo + do concurrent (J=jsv-1:jev, domore_v(J,k)) + domore_k(k) = 1 + enddo endif ; enddo else - !$OMP do ordered do k=1,nz ; if (domore_k(k) > 0) then ! First, advect meridionally. call advect_y(Reg%Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & @@ -310,28 +342,25 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first local_advect_scheme) endif ; enddo - !$OMP do ordered do k=1,nz ; if (domore_k(k) > 0) then ! Next, advect zonally. call advect_x(Reg%Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & isv, iev, jsv, jev, k, G, GV, US, local_advect_scheme) ! Update domore_k(k) for the next iteration + !$omp target domore_k(k) = 0 - do concurrent (j=jsv:jev, domore_u(j,k)) ; domore_k(k) = 1 ; enddo - do concurrent (J=jsv-1:jev, domore_v(J,k)) ; domore_k(k) = 1 ; enddo + !$omp end target + do concurrent (j=jsv:jev, domore_u(j,k)) + domore_k(k) = 1 + enddo + do concurrent (J=jsv-1:jev, domore_v(J,k)) + domore_k(k) = 1 + enddo endif ; enddo endif ! x_first - ! transfer Tr(m) members before releasing Tr array - !$ do m = 1, ntr - !$omp target exit data map(from: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%ad_y, Reg%Tr(m)%ad2d_y, Reg%Tr(m)%advection_xy) - !$ enddo - - !$omp target exit data map(from: uhr, vhr, hprev) map(release: uh_neglect, vh_neglect, domore_u, domore_v, local_advect_scheme, Reg%tr(:), OBC) - !!$OMP end parallel - ! If the advection just isn't finishing after max_iter, move on. if (itt >= max_iter) then exit @@ -352,10 +381,29 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first enddo ! Iterations loop - if (present(uhr_out)) uhr_out(:,:,:) = uhr(:,:,:) - if (present(vhr_out)) vhr_out(:,:,:) = vhr(:,:,:) + ! transfer Tr(m) members before releasing Tr array + !$ do m = 1, ntr + !$omp target exit data map(from: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%ad_y, Reg%Tr(m)%ad2d_y, Reg%Tr(m)%advection_xy) + !$ enddo + + !$omp target exit data map(from: uhr, vhr, hprev) map(release: CS, uh_neglect, vh_neglect, domore_u, domore_v, local_advect_scheme, Reg, Reg%tr(:), OBC) + + if (present(uhr_out)) then + do concurrent (k=1:nz, j=jsd:jed, i=isdB:iedB) + uhr_out(i,j,k) = uhr(i,j,k) + enddo + endif + if (present(vhr_out)) then + do concurrent (k=1:nz, j=jsdB:jedB, i=isd:ied) + vhr_out(i,j,k) = vhr(i,j,k) + enddo + endif if (present(vol_prev) .and. present(update_vol_prev)) then - if (update_vol_prev) vol_prev(:,:,:) = hprev(:,:,:) + if (update_vol_prev) then + do concurrent (k=1:nz, j=jsd:jed, i=isd:ied) + vol_prev(i,j,k) = hprev(i,j,k) + enddo + endif endif call cpu_clock_end(id_clock_advect) @@ -505,7 +553,8 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & endif endif enddo - do concurrent (m = 1:ntr, i=OBC%segment(n)%HI%IsdB-1:OBC%segment(n)%HI%IsdB+1) ! Apply update tracer values for slope calculation + ! Apply update tracer values for slope calculation + do concurrent (m = 1:ntr, i=OBC%segment(n)%HI%IsdB-1:OBC%segment(n)%HI%IsdB+1) Tp = T_tmp(i+1,m) ; Tc = T_tmp(i,m) ; Tm = T_tmp(i-1,m) dMx = max( Tp, Tc, Tm ) - Tc dMn= Tc - min( Tp, Tc, Tm ) @@ -719,18 +768,20 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & enddo ! diagnostics - if (associated(Tr(m)%ad_x)) then ; do concurrent (I=is-1:ie) ; if (do_i(i,j) .or. do_i(i+1,j)) then - Tr(m)%ad_x(I,j,k) = Tr(m)%ad_x(I,j,k) + flux_x(I,j,m)*Idt - endif ; enddo ; endif + if (associated(Tr(m)%ad_x)) then + do concurrent (I=is-1:ie, do_i(i,j) .or. do_i(i+1,j)) + Tr(m)%ad_x(I,j,k) = Tr(m)%ad_x(I,j,k) + flux_x(I,j,m)*Idt + enddo + endif ! diagnose convergence of flux_x (do not use the Ihnew(i) part of the logic). ! division by areaT to get into W/m2 for heat and kg/(s*m2) for salt. if (associated(Tr(m)%advection_xy)) then - do concurrent (i=is:ie) ; if (do_i(i,j)) then + do concurrent (i=is:ie, do_i(i,j)) Tr(m)%advection_xy(i,j,k) = Tr(m)%advection_xy(i,j,k) - & (flux_x(I,j,m) - flux_x(I-1,j,m)) * & Idt * G%IareaT(i,j) - endif ; enddo + enddo endif enddo @@ -738,23 +789,21 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & endif ; enddo ! End of j-loop. ! Do user controlled underflow of the tracer concentrations. - do concurrent (m=1:ntr) ; if (Tr(m)%conc_underflow > 0.0) then + do concurrent (m=1:ntr, Tr(m)%conc_underflow > 0.0) do concurrent (j=js:je, i=is:ie) if (abs(Tr(m)%t(i,j,k)) < Tr(m)%conc_underflow) Tr(m)%t(i,j,k) = 0.0 enddo - endif ; enddo + enddo ! compute ad2d_x diagnostic outside above j-loop so as to make the summation ordered when OMP is active. - !$OMP ordered do m=1,ntr ; if (associated(Tr(m)%ad2d_x)) then - do concurrent (j=js:je) ; if (domore_u_initial(j,k)) then - do concurrent (I=is-1:ie) ; if (do_i(i,j) .or. do_i(i+1,j)) then + do concurrent (j=js:je, domore_u_initial(j,k)) + do concurrent (I=is-1:ie, do_i(i,j) .or. do_i(i+1,j)) Tr(m)%ad2d_x(I,j) = Tr(m)%ad2d_x(I,j) + flux_x(I,j,m)*Idt - endif ; enddo - endif ; enddo + enddo + enddo endif ; enddo ! End of m-loop. - !$OMP end ordered !$omp target exit data & !$omp map(release: slope_x, T_tmp, uhh, CFL, hlst, Ihnew, do_i, flux_x) @@ -852,7 +901,9 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & do_j_tr(j) = .false. enddo do concurrent (J=js-1:je, domore_v(J,k)) - do concurrent (j2=1-stencil:stencil) ; do_j_tr(j+j2) = .true. ; enddo + do concurrent (j2=1-stencil:stencil) + do_j_tr(j+j2) = .true. + enddo enddo do concurrent (j=SZJB_(G)) domore_v_initial(j) = domore_v(j,k) @@ -861,27 +912,29 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & ! Calculate the j-direction profiles (slopes) of each tracer that ! is being advected. if (usePLMslope) then - do concurrent (j=js-stencil:je+stencil, do_j_tr(j)) ; do concurrent (m=1:ntr, i=is:ie) - !if (ABS(Tr(m)%t(i,j+1,k)-Tr(m)%t(i,j,k)) < & - ! ABS(Tr(m)%t(i,j,k)-Tr(m)%t(i,j-1,k))) then - ! maxslope = 4.0*(Tr(m)%t(i,j+1,k)-Tr(m)%t(i,j,k)) - !else - ! maxslope = 4.0*(Tr(m)%t(i,j,k)-Tr(m)%t(i,j-1,k)) - !endif - !if ((Tr(m)%t(i,j+1,k)-Tr(m)%t(i,j,k))*(Tr(m)%t(i,j,k)-Tr(m)%t(i,j-1,k)) < 0.0) then - ! slope_y(i,m,j) = 0.0 - !elseif (ABS(Tr(m)%t(i,j+1,k)-Tr(m)%t(i,j-1,k)) 0 .or. OBC%segnum_v(i,J) < 0) ! OBC_DIRECTION_N or OBC_DIRECTION_S + ! OBC_DIRECTION_N or OBC_DIRECTION_S + do concurrent (i=is:ie, OBC%segnum_v(i,J-1) > 0 .or. OBC%segnum_v(i,J) < 0) do_i(i,j) = .false. enddo endif @@ -1157,7 +1215,6 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & enddo ! compute ad_y and ad2d_y diagnostic outside above j-loop so as to make the summation ordered when OMP is active. - !$OMP ordered do m=1,ntr ; if (associated(Tr(m)%ad_y)) then do concurrent (J=js-1:je, domore_v_initial(J)) do concurrent (i=is:ie, do_i(i,j) .or. do_i(i,j+1)) @@ -1173,7 +1230,6 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & enddo enddo endif ; enddo ! End of m-loop. - !$OMP end ordered !$omp target exit data map(release: vhh, T_tmp, slope_y, flux_y, domore_v_initial, do_j_tr, do_i) From bfcf100156cc3d20fd374234691705301bc902f1 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 19 Nov 2025 12:15:02 +1100 Subject: [PATCH 6/7] improve tracer advection data transfers Couldn't safely init tracers in MOM_tracer_registry without changing answers. The number of transfers end up being the same since tracers need to be defensively updated to host even if they were persistent on device. --- src/tracer/MOM_tracer_advect.F90 | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/src/tracer/MOM_tracer_advect.F90 b/src/tracer/MOM_tracer_advect.F90 index 56dc00a288..3c73d4e050 100644 --- a/src/tracer/MOM_tracer_advect.F90 +++ b/src/tracer/MOM_tracer_advect.F90 @@ -116,7 +116,8 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first !$omp target update to(uhtr, vhtr, h_end) - !$omp target enter data map(to: Reg, Reg%Tr(:), CS, OBC) map(alloc: domore_u, domore_v, uhr, vhr, uh_neglect, vh_neglect, hprev) + !$omp target enter data map(to: OBC) map(alloc: domore_u, domore_v, uhr, vhr, uh_neglect, & + !$omp vh_neglect, hprev, domore_k, local_advect_scheme, Reg, Reg%Tr(:)) do concurrent (k=1:GV%ke, j=G%jsd:G%jed) domore_u(j,k) = .false. @@ -161,6 +162,8 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first stencil = max(stencil, stencil_local) enddo + !$omp target update from(local_advect_scheme) + if (min(is-isd,ied-ie,js-jsd,jed-je) < stencil) then call MOM_error(FATAL, "MOM_tracer_advect: "//& "stencil is wider than the halo.") @@ -175,11 +178,10 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first call create_group_pass(CS%pass_uhr_vhr_t_hprev, hprev, G%Domain) do m=1,ntr call create_group_pass(CS%pass_uhr_vhr_t_hprev, Reg%Tr(m)%t, G%Domain) - !$omp target enter data map(to: Reg%Tr(m)%t) map(alloc: Reg%Tr(m)%ad_x, Reg%Tr(m)%ad_y, Reg%Tr(m)%ad2d_y, Reg%Tr(m)%advection_xy) enddo call cpu_clock_end(id_clock_pass) - !$omp target enter data map(to: local_advect_scheme) + !!$omp target enter data map(to: local_advect_scheme) ! This initializes the halos of uhr and vhr because pass_vector might do ! calculations on them, even though they are never used. @@ -228,6 +230,14 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first vh_neglect(i,J) = GV%H_subroundoff * MIN(G%areaT(i,j), G%areaT(i,j+1)) enddo + ! update GPU copy of Tr(:)%t + ! only update t because other members are zeroed + !$ do m=1,ntr + !$omp target enter data map(to: Reg%Tr(m)%t) & + !$omp map(alloc: Reg%Tr(m)%ad_x, Reg%Tr(m)%ad_y, Reg%Tr(m)%ad2d_x, Reg%Tr(m)%ad2d_y, & + !$omp Reg%Tr(m)%advection_xy) + !$ enddo + ! initialize diagnostic fluxes and tendencies do concurrent (m=1:ntr) if (associated(Reg%Tr(m)%ad_x)) then @@ -368,6 +378,7 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first ! Exit if there are no layers that need more iterations. if (isv > is-stencil) then + !$omp target update from(domore_k) do_any = 0 call cpu_clock_begin(id_clock_sync) call sum_across_PEs(domore_k(:), nz) @@ -381,13 +392,11 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first enddo ! Iterations loop - ! transfer Tr(m) members before releasing Tr array !$ do m = 1, ntr - !$omp target exit data map(from: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%ad_y, Reg%Tr(m)%ad2d_y, Reg%Tr(m)%advection_xy) + !$omp target exit data map(from: Reg%Tr(m)%t, Reg%Tr(m)%ad_x, Reg%Tr(m)%ad_y, Reg%Tr(m)%ad2d_x, & + !$omp Reg%Tr(m)%ad2d_y, Reg%Tr(m)%advection_xy) !$ enddo - !$omp target exit data map(from: uhr, vhr, hprev) map(release: CS, uh_neglect, vh_neglect, domore_u, domore_v, local_advect_scheme, Reg, Reg%tr(:), OBC) - if (present(uhr_out)) then do concurrent (k=1:nz, j=jsd:jed, i=isdB:iedB) uhr_out(i,j,k) = uhr(i,j,k) @@ -406,6 +415,9 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first endif endif + !$omp target exit data map(from: hprev) map(release: uhr, vhr, uh_neglect, vh_neglect, domore_u, & + !$omp domore_v, local_advect_scheme, OBC, domore_k, Reg, Reg%Tr(:)) + call cpu_clock_end(id_clock_advect) end subroutine advect_tracer @@ -1279,6 +1291,8 @@ subroutine tracer_advect_init(Time, G, US, param_file, diag, CS) default=.false.) endif + !$omp target enter data map(to: CS) + id_clock_advect = cpu_clock_id('(Ocean advect tracer)', grain=CLOCK_MODULE) id_clock_pass = cpu_clock_id('(Ocean tracer halo updates)', grain=CLOCK_ROUTINE) id_clock_sync = cpu_clock_id('(Ocean tracer global synch)', grain=CLOCK_ROUTINE) @@ -1289,6 +1303,7 @@ end subroutine tracer_advect_init subroutine tracer_advect_end(CS) type(tracer_advect_CS), pointer :: CS !< module control structure + !$omp target exit data map(delete: CS) if (associated(CS)) deallocate(CS) end subroutine tracer_advect_end From b76e03fe3a3aaf30b682f47e23a0e6f9120a9bcc Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 27 Mar 2026 17:40:17 +1100 Subject: [PATCH 7/7] fix present table errors initial allocs and loops put after the early return also adds DO_LOCALITY where needed --- src/tracer/MOM_tracer_advect.F90 | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/tracer/MOM_tracer_advect.F90 b/src/tracer/MOM_tracer_advect.F90 index 3c73d4e050..4ff8f83b39 100644 --- a/src/tracer/MOM_tracer_advect.F90 +++ b/src/tracer/MOM_tracer_advect.F90 @@ -2,6 +2,8 @@ ! See the LICENSE file for licensing information. ! SPDX-License-Identifier: Apache-2.0 +#include + !> This module contains the subroutines that advect tracers along coordinate surfaces. module MOM_tracer_advect @@ -114,17 +116,6 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first integer :: stencil_local ! Stencil for the local adection scheme integer :: local_advect_scheme(Reg%ntr) ! contains the list of the advection for each tracer - !$omp target update to(uhtr, vhtr, h_end) - - !$omp target enter data map(to: OBC) map(alloc: domore_u, domore_v, uhr, vhr, uh_neglect, & - !$omp vh_neglect, hprev, domore_k, local_advect_scheme, Reg, Reg%Tr(:)) - - do concurrent (k=1:GV%ke, j=G%jsd:G%jed) - domore_u(j,k) = .false. - enddo - do concurrent (k=1:GV%ke, j=G%jsdB:G%jedB) - domore_v(j,k) = .false. - enddo is = G%isc ; ie = G%iec ; js = G%jsc ; je = G%jec ; nz = GV%ke isd = G%isd ; ied = G%ied ; jsd = G%jsd ; jed = G%jed IsdB = G%IsdB ; IedB = G%IedB ; JsdB = G%JsdB ; JedB = G%JedB @@ -139,6 +130,19 @@ subroutine advect_tracer(h_end, uhtr, vhtr, OBC, dt, G, GV, US, CS, Reg, x_first if (.not. associated(Reg)) call MOM_error(FATAL, "MOM_tracer_advect: "// & "register_tracer must be called before advect_tracer.") if (Reg%ntr==0) return + + !$omp target update to(uhtr, vhtr, h_end) + + !$omp target enter data map(to: OBC) map(alloc: domore_u, domore_v, uhr, vhr, uh_neglect, & + !$omp vh_neglect, hprev, domore_k, local_advect_scheme, Reg, Reg%Tr(:)) + + do concurrent (k=1:nz, j=jsd:jed) + domore_u(j,k) = .false. + enddo + do concurrent (k=1:nz, j=jsdB:jedB) + domore_v(j,k) = .false. + enddo + call cpu_clock_begin(id_clock_advect) x_first = (MOD(G%first_direction,2) == 0) @@ -584,7 +588,7 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & ! the minimum of the remaining mass flux (uhr) and the half the mass ! in the cell plus whatever part of its half of the mass flux that ! the flux through the other side does not require. - do concurrent (I=is-1:ie) reduce(.or.:domore_u_jk) + do concurrent (I=is-1:ie) DO_LOCALITY(reduce(.or.:domore_u_jk)) if ((uhr(I,j,k) == 0.0) .or. & ((uhr(I,j,k) < 0.0) .and. (hprev(i+1,j,k) <= tiny_h)) .or. & ((uhr(I,j,k) > 0.0) .and. (hprev(i,j,k) <= tiny_h)) ) then @@ -614,7 +618,7 @@ subroutine advect_x(Tr, hprev, uhr, uh_neglect, OBC, domore_u, ntr, Idt, & CFL(I) = uhh(I) / (hprev(i,j,k)) ! CFL is positive endif enddo - + domore_u(j,k) = domore_u_jk do concurrent (m=1:ntr) @@ -1006,7 +1010,7 @@ subroutine advect_y(Tr, hprev, vhr, vh_neglect, OBC, domore_v, ntr, Idt, & do J=js-1,je ; if (domore_v(J,k)) then domore_v_jk = .false. - do concurrent (i=is:ie) reduce(.or.:domore_v_jk) + do concurrent (i=is:ie) DO_LOCALITY(reduce(.or.:domore_v_jk)) if ((vhr(i,J,k) == 0.0) .or. & ((vhr(i,J,k) < 0.0) .and. (hprev(i,j+1,k) <= tiny_h)) .or. & ((vhr(i,J,k) > 0.0) .and. (hprev(i,j,k) <= tiny_h)) ) then