diff --git a/src/core/MOM.F90 b/src/core/MOM.F90
index b363da01cb..d07011057a 100644
--- a/src/core/MOM.F90
+++ b/src/core/MOM.F90
@@ -666,7 +666,7 @@ subroutine step_MOM(forces_in, fluxes_in, sfc_state, Time_start, time_int_in, CS
       call set_derived_forcing_fields(forces, fluxes, G, US, GV%Rho0)
     endif
   endif
-  !$omp target enter data map(to: forces, forces%taux, forces%tauy)
+  !$omp target enter data map(to: forces, forces%taux, forces%tauy, forces%ustar)
 
   ! This will be replaced later with the pressures from forces or fluxes if they are available.
   if (associated(CS%tv%p_surf)) CS%tv%p_surf(:,:) = 0.0
@@ -726,7 +726,7 @@ subroutine step_MOM(forces_in, fluxes_in, sfc_state, Time_start, time_int_in, CS
     if (nonblocking_p_surf_update) then
       call start_group_pass(pass_tau_ustar_psurf, G%Domain)
     else
-      call do_group_pass(pass_tau_ustar_psurf, G%Domain)
+      call do_group_pass(pass_tau_ustar_psurf, G%Domain, omp_offload=.true.)
     endif
     call cpu_clock_end(id_clock_pass)
 
diff --git a/src/core/MOM_PressureForce_FV.F90 b/src/core/MOM_PressureForce_FV.F90
index 057d514b60..2d34d80e6e 100644
--- a/src/core/MOM_PressureForce_FV.F90
+++ b/src/core/MOM_PressureForce_FV.F90
@@ -1363,9 +1363,11 @@ subroutine PressureForce_FV_Bouss(h, tv, PFu, PFv, G, GV, US, CS, ALE_CSp, ADp,
   endif
 
   ! Set the pressure anomalies at the interfaces.
-  do k=1,nz
-    do concurrent (j=Jsq:Jeq+1, i=Isq:Ieq+1)
-      pa(i,j,K+1) = pa(i,j,K) + dpa(i,j,k)
+  do concurrent (j=Jsq:Jeq+1)
+    do k=1,nz
+      do concurrent (i=Isq:Ieq+1)
+        pa(i,j,K+1) = pa(i,j,K) + dpa(i,j,k)
+      enddo
     enddo
   enddo
 
@@ -1565,32 +1567,24 @@ subroutine PressureForce_FV_Bouss(h, tv, PFu, PFv, G, GV, US, CS, ALE_CSp, ADp,
     ! assuming that the surface pressure anomaly varies linearly in x and y.
     ! If there is an ice-shelf or icebergs, this linear variation would need to be applied
     ! to an interior interface.
-    !$omp target
-    !$omp parallel loop collapse(2)
-    do j=js,je ; do I=Isq,Ieq
+    do concurrent (j=js:je, I=Isq:Ieq)
       intx_pa(I,j,1) = 0.5*(pa(i,j,1) + pa(i+1,j,1))
-    enddo ; enddo
-    !$omp parallel loop collapse(2)
-    do J=Jsq,Jeq ; do i=is,ie
+    enddo
+    do concurrent (J=Jsq:Jeq, i=is:ie)
       inty_pa(i,J,1) = 0.5*(pa(i,j,1) + pa(i,j+1,1))
-    enddo ; enddo
-    !$omp end target
+    enddo
   endif
 
-  !$omp target
   do k=1,nz
-    !$omp parallel loop collapse(2)
-    do j=js,je ; do I=Isq,Ieq
+    do concurrent (j=js:je, I=Isq:Ieq)
       intx_pa(I,j,K+1) = intx_pa(I,j,K) + intx_dpa(I,j,k)
-    enddo ; enddo
+    enddo
   enddo
   do k=1,nz
-    !$omp parallel loop collapse(2)
-    do J=Jsq,Jeq ; do i=is,ie
+    do concurrent (J=Jsq:Jeq, i=is:ie)
       inty_pa(i,J,K+1) = inty_pa(i,J,K) + inty_dpa(i,J,k)
-    enddo ; enddo
+    enddo
   enddo
-  !$omp end target
 
   if (CS%reset_intxpa_integral) then
     ! Having stored the pressure gradient info, we can work out where the first nonvanished layers is
@@ -1930,13 +1924,11 @@ subroutine PressureForce_FV_Bouss(h, tv, PFu, PFv, G, GV, US, CS, ALE_CSp, ADp,
       enddo
     endif
 
-    do k=1,nz
-      do concurrent (j=js:je, I=Isq:Ieq)
-        PFu(I,j,k) = PFu(I,j,k) - (dM(i+1,j) - dM(i,j)) * G%IdxCu(I,j)
-      enddo
-      do concurrent (J=Jsq:Jeq, i=is:ie)
-        PFv(i,J,k) = PFv(i,J,k) - (dM(i,j+1) - dM(i,j)) * G%IdyCv(i,J)
-      enddo
+    do concurrent (k=1:nz, j=js:je, I=Isq:Ieq)
+      PFu(I,j,k) = PFu(I,j,k) - (dM(i+1,j) - dM(i,j)) * G%IdxCu(I,j)
+    enddo
+    do concurrent (k=1:nz, J=Jsq:Jeq, i=is:ie)
+      PFv(i,J,k) = PFv(i,J,k) - (dM(i,j+1) - dM(i,j)) * G%IdyCv(i,J)
     enddo
 
     !$omp end target data
diff --git a/src/core/MOM_PressureForce_Montgomery.F90 b/src/core/MOM_PressureForce_Montgomery.F90
index 6e148ee642..74ae2b9af5 100644
--- a/src/core/MOM_PressureForce_Montgomery.F90
+++ b/src/core/MOM_PressureForce_Montgomery.F90
@@ -689,17 +689,19 @@ subroutine Set_pbce_Bouss(e, tv, G, GV, US, Rho0, GFS_scale, pbce, rho_star)
 
   if (use_EOS) then
     if (present(rho_star)) then
-      do concurrent (j=Jsq:Jeq+1, i=Isq:Ieq+1)
+      do concurrent (j=Jsq:Jeq+1)
+      do concurrent (i=Isq:Ieq+1)
         Ihtot(i,j) = GV%H_to_Z / ((e(i,j,1) - e(i,j,nz+1)) + dz_neglect)
         pbce(i,j,1) = GFS_scale * rho_star(i,j,1) * GV%H_to_Z
       enddo
 
       do k=2,nz
-        do concurrent (j=Jsq:Jeq+1, i=Isq:Ieq+1)
+        do concurrent (i=Isq:Ieq+1)
           pbce(i,j,k) = pbce(i,j,k-1) + (rho_star(i,j,k) - rho_star(i,j,k-1)) &
               * ((e(i,j,K) - e(i,j,nz+1)) * Ihtot(i,j))
         enddo
       enddo
+    enddo
     else
       !$omp target data &
       !$omp   map(alloc: EOSdom, press, T_int, S_int, rho_in_situ) &
@@ -743,14 +745,16 @@ subroutine Set_pbce_Bouss(e, tv, G, GV, US, Rho0, GFS_scale, pbce, rho_star)
       !$omp end target data
     endif
   else
-    do concurrent (j=Jsq:Jeq+1, i=Isq:Ieq+1)
-      Ihtot(i,j) = 1.0 / ((e(i,j,1) - e(i,j,nz+1)) + dz_neglect)
-      pbce(i,j,1) = GV%g_prime(1) * GV%H_to_Z
-    enddo
-    do k=2,nz
-      do concurrent (j=Jsq:Jeq+1, i=Isq:Ieq+1)
-        pbce(i,j,k) = pbce(i,j,k-1) + (GV%g_prime(K) * GV%H_to_Z) &
-            * ((e(i,j,K) - e(i,j,nz+1)) * Ihtot(i,j))
+    do concurrent (j=Jsq:Jeq+1)
+      do concurrent (i=Isq:Ieq+1)
+        Ihtot(i,j) = 1.0 / ((e(i,j,1) - e(i,j,nz+1)) + dz_neglect)
+        pbce(i,j,1) = GV%g_prime(1) * GV%H_to_Z
+      enddo
+      do k=2,nz
+        do concurrent (i=Isq:Ieq+1)
+          pbce(i,j,k) = pbce(i,j,k-1) + (GV%g_prime(K) * GV%H_to_Z) &
+              * ((e(i,j,K) - e(i,j,nz+1)) * Ihtot(i,j))
+        enddo
       enddo
     enddo
   endif
diff --git a/src/core/MOM_barotropic.F90 b/src/core/MOM_barotropic.F90
index 2448b2d051..10adc0a492 100644
--- a/src/core/MOM_barotropic.F90
+++ b/src/core/MOM_barotropic.F90
@@ -1068,27 +1068,39 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, forces, pbce,
   do concurrent (J=js-1:je, i=is-1:ie+1)
     vbt_Cor(i,J) = 0.0
   enddo
-  do k=1,nz ; do concurrent (j=js:je, I=is-1:ie)
-    ubt_Cor(I,j) = ubt_Cor(I,j) + wt_u(I,j,k) * U_Cor(I,j,k)
-  enddo ; enddo
-  do k=1,nz ; do concurrent (J=js-1:je, i=is:ie)
-    vbt_Cor(i,J) = vbt_Cor(i,J) + wt_v(i,J,k) * V_Cor(i,J,k)
-  enddo ; enddo
+  do concurrent (j=js:je)
+    do k=1,nz
+      do concurrent (I=is-1:ie)
+        ubt_Cor(I,j) = ubt_Cor(I,j) + wt_u(I,j,k) * U_Cor(I,j,k)
+      enddo
+    enddo
+  enddo
+  do concurrent (J=js-1:je)
+    do k=1,nz
+      do concurrent (i=is:ie)
+        vbt_Cor(i,J) = vbt_Cor(i,J) + wt_v(i,J,k) * V_Cor(i,J,k)
+      enddo
+    enddo
+  enddo
 
   ! The gtot arrays are the effective layer-weighted reduced gravities for
   ! accelerations across the various faces, with names for the relative
   ! locations of the faces to the pressure point.  They will have their halos
   ! updated later on.
-  do k=1,nz
-    do concurrent (j=js:je, i=is-1:ie)
-      gtot_E(i,j)   = gtot_E(i,j)   + pbce(i,j,k)   * wt_u(I,j,k)
-      gtot_W(i+1,j) = gtot_W(i+1,j) + pbce(i+1,j,k) * wt_u(I,j,k)
+  do concurrent (j=js:je)
+    do k=1,nz
+      do concurrent (i=is-1:ie)
+        gtot_E(i,j)   = gtot_E(i,j)   + pbce(i,j,k)   * wt_u(I,j,k)
+        gtot_W(i+1,j) = gtot_W(i+1,j) + pbce(i+1,j,k) * wt_u(I,j,k)
+      enddo
     enddo
   enddo
-  do k=1,nz
-    do concurrent (J=js-1:je, i=is:ie)
-      gtot_N(i,j)   = gtot_N(i,j)   + pbce(i,j,k)   * wt_v(i,J,k)
-      gtot_S(i,j+1) = gtot_S(i,j+1) + pbce(i,j+1,k) * wt_v(i,J,k)
+  do concurrent (J=js-1:je)
+    do k=1,nz
+      do concurrent (i=is:ie)
+        gtot_N(i,j)   = gtot_N(i,j)   + pbce(i,j,k)   * wt_v(i,J,k)
+        gtot_S(i,j+1) = gtot_S(i,j+1) + pbce(i,j+1,k) * wt_v(i,J,k)
+      enddo
     enddo
   enddo
 
@@ -1172,14 +1184,22 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, forces, pbce,
         vbt(i,J) = vbt(i,J) + wt_v(i,J,k) * v_vh0(i,J,k)
       enddo ; enddo
     else
-      do k=1,nz ; do concurrent (j=js:je, I=is-1:ie)
-        uhbt(I,j) = uhbt(I,j) + uh0(I,j,k)
-        ubt(I,j) = ubt(I,j) + CS%frhatu(I,j,k) * u_uh0(I,j,k)
-      enddo ; enddo
-      do k=1,nz ; do concurrent (J=js-1:je, i=is:ie)
-        vhbt(i,J) = vhbt(i,J) + vh0(i,J,k)
-        vbt(i,J) = vbt(i,J) + CS%frhatv(i,J,k) * v_vh0(i,J,k)
-      enddo ; enddo
+      do concurrent (j=js:je)
+        do k=1,nz
+          do concurrent (I=is-1:ie)
+            uhbt(I,j) = uhbt(I,j) + uh0(I,j,k)
+            ubt(I,j) = ubt(I,j) + CS%frhatu(I,j,k) * u_uh0(I,j,k)
+          enddo
+        enddo
+      enddo
+      do concurrent (J=js-1:je)
+        do k=1,nz
+          do concurrent (i=is:ie)
+            vhbt(i,J) = vhbt(i,J) + vh0(i,J,k)
+            vbt(i,J) = vbt(i,J) + CS%frhatv(i,J,k) * v_vh0(i,J,k)
+          enddo
+        enddo
+      enddo
     endif
     if ((use_BT_cont .or. integral_BT_cont) .and. CS%adjust_BT_cont) then
       ! Use the additional input transports to broaden the fits
@@ -1324,12 +1344,20 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, forces, pbce,
 
   ! bc_accel_u & bc_accel_v are only available on the potentially
   ! non-symmetric computational domain.
-  do k=1,nz ; do concurrent (j=js:je, I=Isq:Ieq)
-    BT_force_u(I,j) = BT_force_u(I,j) + wt_u(I,j,k) * bc_accel_u(I,j,k)
-  enddo ; enddo
-  do k=1,nz ; do concurrent (J=Jsq:Jeq, i=is:ie)
-    BT_force_v(i,J) = BT_force_v(i,J) + wt_v(i,J,k) * bc_accel_v(i,J,k)
-  enddo ; enddo
+  do concurrent (j=js:je)
+    do k=1,nz
+      do concurrent (I=Isq:Ieq)
+        BT_force_u(I,j) = BT_force_u(I,j) + wt_u(I,j,k) * bc_accel_u(I,j,k)
+      enddo
+    enddo
+  enddo
+  do concurrent (J=Jsq:Jeq)
+    do k=1,nz
+      do concurrent (i=is:ie)
+        BT_force_v(i,J) = BT_force_v(i,J) + wt_v(i,J,k) * bc_accel_v(i,J,k)
+      enddo
+    enddo
+  enddo
 
   if (CS%gradual_BT_ICs) then
     do concurrent (j=js:je, I=is-1:ie)
@@ -1479,15 +1507,23 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, forces, pbce,
   do concurrent (j=js-1:je+1, I=is-1:ie)
     av_rem_u(I,j) = 0.0
   enddo
-  do concurrent (J=js-1:je, i=is-1:ie+1)
-    av_rem_v(i,J) = 0.0
+  do concurrent (j=js:je)
+    do k=1,nz
+      do concurrent (I=is-1:ie)
+        av_rem_u(I,j) = av_rem_u(I,j) + CS%frhatu(I,j,k) * visc_rem_u(I,j,k)
+      enddo
+    enddo
+  enddo
+  do concurrent (J=js-1:je)
+    do concurrent(i=is-1:ie+1)
+      av_rem_v(i,J) = 0.0
+    enddo
+    do k=1,nz
+      do concurrent (i=is:ie)
+        av_rem_v(i,J) = av_rem_v(i,J) + CS%frhatv(i,J,k) * visc_rem_v(i,J,k)
+      enddo
+    enddo
   enddo
-  do k=1,nz ; do concurrent (j=js:je, I=is-1:ie)
-    av_rem_u(I,j) = av_rem_u(I,j) + CS%frhatu(I,j,k) * visc_rem_u(I,j,k)
-  enddo ; enddo
-  do k=1,nz ; do concurrent (J=js-1:je, i=is:ie)
-    av_rem_v(i,J) = av_rem_v(i,J) + CS%frhatv(i,J,k) * visc_rem_v(i,J,k)
-  enddo ; enddo
   if (CS%strong_drag) then
     do concurrent (j=js:je, I=is-1:ie)
       bt_rem_u(I,j) = G%mask2dCu(I,j) * &
@@ -1815,6 +1851,7 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, forces, pbce,
   else
     I_sum_wt_vel = 1.0 ; I_sum_wt_eta = 1.0 ; I_sum_wt_accel = 1.0 ; I_sum_wt_trans = 1.0
   endif
+  !$omp target enter data map(to: wt_vel, wt_eta, wt_accel, wt_trans, wt_accel2)
 
   ! March the barotropic solver through all of its time steps.
   call btstep_timeloop(eta, ubt, vbt, uhbt0, Datu, BTCL_u, vhbt0, Datv, BTCL_v, eta_IC, &
@@ -1826,6 +1863,8 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, forces, pbce,
                 LDv_avg, use_BT_cont, interp_eta_PF, find_etaav, dt, dtbt, nstep, nfilter, &
                 wt_vel, wt_eta, wt_accel, wt_trans, wt_accel2, ADp, CS%BT_OBC, CS, G, MS, GV, US)
 
+  !$omp target exit data map(release: wt_vel, wt_eta, wt_accel, wt_trans, wt_accel2)
+
   if (id_clock_calc > 0) call cpu_clock_end(id_clock_calc)
   if (id_clock_calc_post > 0) call cpu_clock_begin(id_clock_calc_post)
 
@@ -3428,18 +3467,25 @@ subroutine btstep_ubt_from_layer(U_in, V_in, wt_u, wt_v, ubt, vbt,  G, GV, CS)
     vbt(i,j) = 0.0
   enddo
 
-  do k=1,nz ; do concurrent (j=js:je, I=is-1:ie)
-    ubt(I,j) = ubt(I,j) + wt_u(I,j,k) * U_in(I,j,k)
-  enddo ; enddo
-  do k=1,nz ; do concurrent (J=js-1:je, i=is:ie)
-    vbt(i,J) = vbt(i,J) + wt_v(i,J,k) * V_in(i,J,k)
-  enddo ; enddo
-
-  do concurrent (j=js:je, I=is-1:ie)
-    if (abs(ubt(I,j)) < CS%vel_underflow) ubt(I,j) = 0.0
+  do concurrent (j=js:je)
+    do k=1,nz
+      do concurrent (I=is-1:ie)
+        ubt(I,j) = ubt(I,j) + wt_u(I,j,k) * U_in(I,j,k)
+      enddo
+    enddo
+    do concurrent (I=is-1:ie)
+      if (abs(ubt(I,j)) < CS%vel_underflow) ubt(I,j) = 0.0
+    enddo
   enddo
-  do concurrent (J=js-1:je, i=is:ie)
-    if (abs(vbt(i,J)) < CS%vel_underflow) vbt(i,J) = 0.0
+  do concurrent (J=js-1:je)
+    do k=1,nz
+      do concurrent (i=is:ie)
+        vbt(i,J) = vbt(i,J) + wt_v(i,J,k) * V_in(i,J,k)
+      enddo
+    enddo
+    do concurrent (i=is:ie)
+      if (abs(vbt(i,J)) < CS%vel_underflow) vbt(i,J) = 0.0
+    enddo
   enddo
 
 end subroutine btstep_ubt_from_layer
@@ -3604,16 +3650,20 @@ subroutine set_dtbt(G, GV, US, CS, pbce, gtot_est, BT_cont, eta, SSH_add)
     dgeo_de = 1.0 + max(0.0, CS%G_extra - det_de)
   endif
   if (present(pbce)) then
-    do concurrent (j=js:je, i=is:ie)
-      gtot_E(i,j) = 0.0 ; gtot_W(i,j) = 0.0
-      gtot_N(i,j) = 0.0 ; gtot_S(i,j) = 0.0
+    do concurrent (j=js:je)
+      do concurrent (i=is:ie)
+        gtot_E(i,j) = 0.0 ; gtot_W(i,j) = 0.0
+        gtot_N(i,j) = 0.0 ; gtot_S(i,j) = 0.0
+      enddo
+      do k=1,nz
+        do concurrent (i=is:ie)
+          gtot_E(i,j) = gtot_E(i,j) + pbce(i,j,k) * CS%frhatu(I,j,k)
+          gtot_W(i,j) = gtot_W(i,j) + pbce(i,j,k) * CS%frhatu(I-1,j,k)
+          gtot_N(i,j) = gtot_N(i,j) + pbce(i,j,k) * CS%frhatv(i,J,k)
+          gtot_S(i,j) = gtot_S(i,j) + pbce(i,j,k) * CS%frhatv(i,J-1,k)
+        enddo
+      enddo
     enddo
-    do k=1,nz ; do concurrent (j=js:je, i=is:ie)
-      gtot_E(i,j) = gtot_E(i,j) + pbce(i,j,k) * CS%frhatu(I,j,k)
-      gtot_W(i,j) = gtot_W(i,j) + pbce(i,j,k) * CS%frhatu(I-1,j,k)
-      gtot_N(i,j) = gtot_N(i,j) + pbce(i,j,k) * CS%frhatv(i,J,k)
-      gtot_S(i,j) = gtot_S(i,j) + pbce(i,j,k) * CS%frhatv(i,J-1,k)
-    enddo ; enddo
   else
     do concurrent (j=js:je, i=is:ie)
       gtot_E(i,j) = gtot_est ; gtot_W(i,j) = gtot_est
@@ -5346,31 +5396,33 @@ subroutine bt_mass_source(h, eta, set_cor, G, GV, CS)
 
   !$omp target data map(alloc: eta_h)
 
-  if (GV%Boussinesq) then
-    do concurrent (j=js:je, i=is:ie)
-      eta_h(i,j) = h(i,j,1) - G%bathyT(i,j)*GV%Z_to_H
-    enddo
-  else
-    do concurrent (j=js:je, i=is:ie)
-      eta_h(i,j) = h(i,j,1)
-    enddo
-  endif
-  do k=2,nz
-    do concurrent (j=js:je, i=is:ie)
-      eta_h(i,j) = eta_h(i,j) + h(i,j,k)
+  do concurrent (j=js:je)
+    if (GV%Boussinesq) then
+      do concurrent (i=is:ie)
+        eta_h(i,j) = h(i,j,1) - G%bathyT(i,j)*GV%Z_to_H
+      enddo
+    else
+      do concurrent (i=is:ie)
+        eta_h(i,j) = h(i,j,1)
+      enddo
+    endif
+    do k=2,nz
+      do concurrent (i=is:ie)
+        eta_h(i,j) = eta_h(i,j) + h(i,j,k)
+      enddo
     enddo
+    if (set_cor) then
+      do concurrent (i=is:ie)
+        d_eta = eta_h(i,j) - eta(i,j)
+        CS%eta_cor(i,j) = d_eta
+      enddo
+    else
+      do concurrent (i=is:ie)
+        d_eta = eta_h(i,j) - eta(i,j)
+        CS%eta_cor(i,j) = CS%eta_cor(i,j) + d_eta
+      enddo
+    endif
   enddo
-  if (set_cor) then
-    do concurrent (j=js:je, i=is:ie)
-      d_eta = eta_h(i,j) - eta(i,j)
-      CS%eta_cor(i,j) = d_eta
-    enddo
-  else
-    do concurrent (j=js:je, i=is:ie)
-      d_eta = eta_h(i,j) - eta(i,j)
-      CS%eta_cor(i,j) = CS%eta_cor(i,j) + d_eta
-    enddo
-  endif
 
   !$omp end target data
 
diff --git a/src/core/MOM_dynamics_split_RK2.F90 b/src/core/MOM_dynamics_split_RK2.F90
index 7b82b9af95..00e8e9ccbf 100644
--- a/src/core/MOM_dynamics_split_RK2.F90
+++ b/src/core/MOM_dynamics_split_RK2.F90
@@ -1186,7 +1186,6 @@ subroutine step_MOM_dyn_split_RK2(u_inst, v_inst, h, tv, visc, Time_local, dt, f
     ! CAu = -(f+zeta_av)/h_av vh + d/dx KE_av
     call CorAdCalc(u_av, v_av, h_av, uh, vh, CS%CAu_pred, CS%CAv_pred, CS%OBC, CS%AD_pred, &
                    G, GV, US, CS%CoriolisAdv, pbv, Waves=Waves)
-    !$omp target update from(CS%CAu_pred, CS%CAv_pred)
     CS%CAu_pred_stored = .true.
     call enable_averages(dt, Time_local, CS%diag) ! Reenable the averaging
     call cpu_clock_end(id_clock_Cor)
diff --git a/src/core/MOM_forcing_type.F90 b/src/core/MOM_forcing_type.F90
index f51ec928b6..fe92d64ad4 100644
--- a/src/core/MOM_forcing_type.F90
+++ b/src/core/MOM_forcing_type.F90
@@ -1228,6 +1228,8 @@ subroutine find_ustar_fluxes(fluxes, tv, U_star, G, GV, US, halo, H_T_units)
     enddo ; enddo
   endif
 
+  !$omp target update to(U_star)
+
 end subroutine find_ustar_fluxes
 
 
@@ -1263,13 +1265,13 @@ subroutine find_ustar_mech_forcing(forces, tv, U_star, G, GV, US, halo, H_T_unit
 
   if (associated(forces%ustar) .and. (GV%Boussinesq .or. .not.associated(forces%tau_mag))) then
     if (Z_T_units) then
-      do j=js,je ; do i=is,ie
+      do concurrent (j=js:je, i=is:ie) !do j=js,je ; do i=is,ie
         U_star(i,j) = forces%ustar(i,j)
-      enddo ; enddo
+      enddo
     else
-      do j=js,je ; do i=is,ie
+      do concurrent (j=js:je, i=is:ie) !do j=js,je ; do i=is,ie
         U_star(i,j) = GV%Z_to_H * forces%ustar(i,j)
-      enddo ; enddo
+      enddo
     endif
   elseif (allocated(tv%SpV_avg)) then
     if (tv%valid_SpV_halo < 0) call MOM_error(FATAL, &
@@ -1277,20 +1279,20 @@ subroutine find_ustar_mech_forcing(forces, tv, U_star, G, GV, US, halo, H_T_unit
     if (tv%valid_SpV_halo < hs) call MOM_error(FATAL, &
         "find_ustar_mech called in non-Boussinesq mode with insufficient valid values of SpV_avg.")
     if (Z_T_units) then
-      do j=js,je ; do i=is,ie
+      do concurrent (j=js:je, i=is:ie)
         U_star(i,j) = sqrt(forces%tau_mag(i,j) * tv%SpV_avg(i,j,1))
-      enddo ; enddo
+      enddo
     else
-      do j=js,je ; do i=is,ie
+      do concurrent (j=js:je, i=is:ie)
         U_star(i,j) = GV%RZ_to_H * sqrt(forces%tau_mag(i,j) / tv%SpV_avg(i,j,1))
-      enddo ; enddo
+      enddo
     endif
   else
     I_rho = GV%Z_to_H * GV%RZ_to_H
     if (Z_T_units) I_rho = GV%H_to_Z * GV%RZ_to_H ! == 1.0 / GV%Rho0
-    do j=js,je ; do i=is,ie
+    do concurrent (j=js:je, i=is:ie)
       U_star(i,j) = sqrt(forces%tau_mag(i,j) * I_rho)
-    enddo ; enddo
+    enddo
   endif
 
 end subroutine find_ustar_mech_forcing
diff --git a/src/parameterizations/vertical/MOM_set_viscosity.F90 b/src/parameterizations/vertical/MOM_set_viscosity.F90
index 5cddbf86fe..12191fd62d 100644
--- a/src/parameterizations/vertical/MOM_set_viscosity.F90
+++ b/src/parameterizations/vertical/MOM_set_viscosity.F90
@@ -2521,7 +2521,6 @@ subroutine set_viscous_ML(u, v, h, tv, forces, visc, dt, G, GV, US, CS)
         visc%nkml_visc_v(i,J) = k_massive(i)
       endif ; enddo ; endif
 
-      !$omp target update to(visc%nkml_visc_u, visc%nkml_visc_v)
     endif ! dynamic_viscous_ML
 
     do_any_shelf = .false.
@@ -2690,6 +2689,8 @@ subroutine set_viscous_ML(u, v, h, tv, forces, visc, dt, G, GV, US, CS)
 
   enddo ! J-loop at v-points
 
+  !$omp target update to(visc%nkml_visc_u, visc%nkml_visc_v) if (CS%dynamic_viscous_ML)
+
   if (CS%debug) then
     if (allocated(visc%nkml_visc_u) .and. allocated(visc%nkml_visc_v)) &
       call uvchksum("nkml_visc_[uv]", visc%nkml_visc_u, visc%nkml_visc_v, &
diff --git a/src/parameterizations/vertical/MOM_vert_friction.F90 b/src/parameterizations/vertical/MOM_vert_friction.F90
index 210a6b2b13..26c5784a48 100644
--- a/src/parameterizations/vertical/MOM_vert_friction.F90
+++ b/src/parameterizations/vertical/MOM_vert_friction.F90
@@ -1,3 +1,5 @@
+#include "do_concurrent_compat.h"
+
 !> Implements vertical viscosity (vertvisc)
 module MOM_vert_friction
 
@@ -781,35 +783,39 @@ subroutine vertvisc(u, v, h, forces, visc, dt, OBC, ADp, CDp, G, GV, US, CS, &
     enddo
   endif
 
-  do k=2,nz
-    if (allocated(visc%Ray_u)) then
-      do concurrent (j=G%jsc:G%jec, I=Isq:Ieq)
-        Ray(I,j) = visc%Ray_u(I,j,k)
+  do concurrent (j=G%jsc:G%jec)
+    do k=2,nz
+      if (allocated(visc%Ray_u)) then
+        do concurrent (I=Isq:Ieq)
+          Ray(I,j) = visc%Ray_u(I,j,k)
+        enddo
+      endif
+
+      do concurrent (I=Isq:Ieq, G%mask2dCu(I,j) > 0.)
+        c1(I,j,k) = dt * CS%a_u(I,j,K) * b1(I,j)
+        b_denom_1 = CS%h_u(I,j,k) + dt * (Ray(I,j) + CS%a_u(I,j,K)*d1(I,j))
+        b1(I,j) = 1.0 / (b_denom_1 + dt * CS%a_u(I,j,K+1))
+        d1(I,j) = b_denom_1 * b1(I,j)
+        u(I,j,k) = (CS%h_u(I,j,k) * u(I,j,k) + &
+                    dt * CS%a_u(I,j,K) * u(I,j,k-1)) * b1(I,j)
       enddo
-    endif
 
-    do concurrent (j=G%jsc:G%jec, I=Isq:Ieq, G%mask2dCu(I,j) > 0.)
-      c1(I,j,k) = dt * CS%a_u(I,j,K) * b1(I,j)
-      b_denom_1 = CS%h_u(I,j,k) + dt * (Ray(I,j) + CS%a_u(I,j,K)*d1(I,j))
-      b1(I,j) = 1.0 / (b_denom_1 + dt * CS%a_u(I,j,K+1))
-      d1(I,j) = b_denom_1 * b1(I,j)
-      u(I,j,k) = (CS%h_u(I,j,k) * u(I,j,k) + &
-                  dt * CS%a_u(I,j,K) * u(I,j,k-1)) * b1(I,j)
+      if (associated(ADp%du_dt_str)) then
+        do concurrent (I=Isq:Ieq, G%mask2dCu(I,j) > 0.)
+          ADp%du_dt_str(I,j,k) = (CS%h_u(I,j,k) * ADp%du_dt_str(I,j,k) &
+              + dt * CS%a_u(I,j,K) * ADp%du_dt_str(I,j,k-1)) * b1(I,j)
+        enddo
+      endif
     enddo
-
-    if (associated(ADp%du_dt_str)) then
-      do concurrent (j=G%jsc:G%jec, I=Isq:Ieq, G%mask2dCu(I,j) > 0.)
-        ADp%du_dt_str(I,j,k) = (CS%h_u(I,j,k) * ADp%du_dt_str(I,j,k) &
-            + dt * CS%a_u(I,j,K) * ADp%du_dt_str(I,j,k-1)) * b1(I,j)
-      enddo
-    endif
   enddo
 
   ! back substitute to solve for the new velocities
   ! u_k = d'_k - c'_k x_(k+1)
-  do k=nz-1,1,-1
-    do concurrent (j=G%jsc:G%jec, I=Isq:Ieq, G%mask2dCu(I,j) > 0.0)
-      u(I,j,k) = u(I,j,k) + c1(I,j,k+1) * u(I,j,k+1)
+  do concurrent (j=G%jsc:G%jec)
+    do k=nz-1,1,-1
+      do concurrent (I=Isq:Ieq, G%mask2dCu(I,j) > 0.0)
+        u(I,j,k) = u(I,j,k) + c1(I,j,k+1) * u(I,j,k+1)
+      enddo
     enddo
   enddo
 
@@ -1021,32 +1027,34 @@ subroutine vertvisc(u, v, h, forces, visc, dt, OBC, ADp, CDp, G, GV, US, CS, &
     enddo
   endif
 
-  do k=2,nz
-    if (allocated(visc%Ray_v)) then
-      do concurrent (j=jsq:jeq, i=is:ie)
-        Ray(i,J) = visc%Ray_v(i,J,k)
+  do concurrent (j=Jsq:Jeq)
+    do k=2,nz
+      if (allocated(visc%Ray_v)) then
+        do concurrent (i=is:ie)
+          Ray(i,J) = visc%Ray_v(i,J,k)
+        enddo
+      endif
+
+      do concurrent (i=is:ie, G%mask2dCv(i,j) > 0.0)
+        c1(i,J,k) = dt * CS%a_v(i,J,K) * b1(i,J)
+        b_denom_1 = CS%h_v(i,J,k) + dt * (Ray(i,J) + CS%a_v(i,J,K)*d1(i,J))
+        b1(i,J) = 1.0 / (b_denom_1 + dt * CS%a_v(i,J,K+1))
+        d1(i,J) = b_denom_1 * b1(i,J)
+        v(i,J,k) = (CS%h_v(i,J,k) * v(i,J,k) + dt * CS%a_v(i,J,K) * v(i,J,k-1)) * b1(i,J)
       enddo
-    endif
 
-    do concurrent (j=jsq:jeq, i=is:ie, G%mask2dCv(i,j) > 0.0)
-      c1(i,J,k) = dt * CS%a_v(i,J,K) * b1(i,J)
-      b_denom_1 = CS%h_v(i,J,k) + dt * (Ray(i,J) + CS%a_v(i,J,K)*d1(i,J))
-      b1(i,J) = 1.0 / (b_denom_1 + dt * CS%a_v(i,J,K+1))
-      d1(i,J) = b_denom_1 * b1(i,J)
-      v(i,J,k) = (CS%h_v(i,J,k) * v(i,J,k) + dt * CS%a_v(i,J,K) * v(i,J,k-1)) * b1(i,J)
+      if (associated(ADp%dv_dt_str)) then
+        do concurrent (i=is:ie, G%mask2dCv(i,j) > 0.0)
+          ADp%dv_dt_str(i,J,k) = (CS%h_v(i,J,k) * ADp%dv_dt_str(i,J,k) &
+              + dt * CS%a_v(i,J,K) * ADp%dv_dt_str(i,J,k-1)) * b1(i,J)
+        enddo
+      endif
     enddo
 
-    if (associated(ADp%dv_dt_str)) then
-      do concurrent (J=Jsq:Jeq, i=is:ie, G%mask2dCv(i,j) > 0.0)
-        ADp%dv_dt_str(i,J,k) = (CS%h_v(i,J,k) * ADp%dv_dt_str(i,J,k) &
-            + dt * CS%a_v(i,J,K) * ADp%dv_dt_str(i,J,k-1)) * b1(i,J)
+    do k=nz-1,1,-1
+      do concurrent (i=is:ie, G%mask2dCv(i,j) > 0.0)
+        v(i,J,k) = v(i,J,k) + c1(i,J,k+1) * v(i,J,k+1)
       enddo
-    endif
-  enddo
-
-  do k=nz-1,1,-1
-    do concurrent (j=jsq:jeq, i=is:ie, G%mask2dCv(i,j) > 0.0)
-      v(i,J,k) = v(i,J,k) + c1(i,J,k+1) * v(i,J,k+1)
     enddo
   enddo
 
@@ -1306,85 +1314,89 @@ subroutine vertvisc_remnant(visc, visc_rem_u, visc_rem_v, dt, G, GV, US, CS)
 
   !$omp target enter data map(alloc: b1, c1, d1, Ray, b_denom_1)
 
-  if (allocated(visc%Ray_u)) then
-    do j=G%jsc,G%jec ; do I=Isq,Ieq
-      Ray(I,j) = visc%Ray_u(I,j,1)
-    enddo ; enddo
-  else
-    do concurrent (j=G%jsc:G%jec, I=Isq:Ieq)
-      Ray(I,j) = 0.
-    enddo
-  endif
-
-  do concurrent (j=G%jsc:G%jec, I=Isq:Ieq, G%mask2dCu(i,j) > 0.)
-    b_denom_1 = CS%h_u(I,j,1) + dt * (Ray(I,j) + CS%a_u(I,j,1))
-    b1(I,j) = 1.0 / (b_denom_1 + dt * CS%a_u(I,j,2))
-    d1(I,j) = b_denom_1 * b1(I,j)
-    visc_rem_u(I,j,1) = b1(I,j) * CS%h_u(I,j,1)
-  enddo
-
-
-  do k=2,nz
+  do concurrent (j=G%jsc:G%jec)
     if (allocated(visc%Ray_u)) then
-      do j=G%jsc,G%jec ; do I=Isq,Ieq
-        Ray(I,j) = visc%Ray_u(I,j,k)
-      enddo ; enddo
+      do concurrent (I=Isq:Ieq)
+        Ray(I,j) = visc%Ray_u(I,j,1)
+      enddo
+    else
+      do concurrent (I=Isq:Ieq)
+        Ray(I,j) = 0.
+      enddo
     endif
 
-    do concurrent (j=G%jsc:G%jec, I=Isq:Ieq, G%mask2dCu(I,j) > 0.)
-      c1(I,j,k) = dt * CS%a_u(I,j,K)*b1(I,j)
-      b_denom_1 = CS%h_u(I,j,k) + dt * (Ray(I,j) + CS%a_u(I,j,K) * d1(I,j))
-      b1(I,j) = 1.0 / (b_denom_1 + dt * CS%a_u(I,j,K+1))
+    do concurrent (I=Isq:Ieq, G%mask2dCu(i,j) > 0.)
+      b_denom_1 = CS%h_u(I,j,1) + dt * (Ray(I,j) + CS%a_u(I,j,1))
+      b1(I,j) = 1.0 / (b_denom_1 + dt * CS%a_u(I,j,2))
       d1(I,j) = b_denom_1 * b1(I,j)
-      visc_rem_u(I,j,k) = (CS%h_u(I,j,k) + dt * CS%a_u(I,j,K) * visc_rem_u(I,j,k-1)) * b1(I,j)
+      visc_rem_u(I,j,1) = b1(I,j) * CS%h_u(I,j,1)
     enddo
-  enddo
 
-  do k=nz-1,1,-1
-    do concurrent (j=G%jsc:G%jec, I=Isq:Ieq, G%mask2dCu(I,j) > 0.)
-      visc_rem_u(I,j,k) = visc_rem_u(I,j,k) + c1(I,j,k+1) * visc_rem_u(I,j,k+1)
-    enddo
-  enddo
+    do k=2,nz
+      if (allocated(visc%Ray_u)) then
+        do concurrent (I=Isq:Ieq)
+          Ray(I,j) = visc%Ray_u(I,j,k)
+        enddo
+      endif
 
-  ! Now find the meridional viscous remnant using the robust tridiagonal solver.
-  if (allocated(visc%Ray_v)) then
-    do J=Jsq,Jeq ; do i=is,ie
-      Ray(i,J) = visc%Ray_v(i,J,1)
-    enddo ; enddo
-  else
-    do concurrent (j=jsq:jeq, i=is:ie)
-      Ray(i,J) = 0.
+      do concurrent (I=Isq:Ieq, G%mask2dCu(I,j) > 0.)
+        c1(I,j,k) = dt * CS%a_u(I,j,K)*b1(I,j)
+        b_denom_1 = CS%h_u(I,j,k) + dt * (Ray(I,j) + CS%a_u(I,j,K) * d1(I,j))
+        b1(I,j) = 1.0 / (b_denom_1 + dt * CS%a_u(I,j,K+1))
+        d1(I,j) = b_denom_1 * b1(I,j)
+        visc_rem_u(I,j,k) = (CS%h_u(I,j,k) + dt * CS%a_u(I,j,K) * visc_rem_u(I,j,k-1)) * b1(I,j)
+      enddo
     enddo
-  endif
 
-  do concurrent (J=Jsq:Jeq, i=is:ie, G%mask2dCv(i,j) > 0.)
-    b_denom_1 = CS%h_v(i,J,1) + dt * (Ray(i,J) + CS%a_v(i,J,1))
-    b1(i,J) = 1.0 / (b_denom_1 + dt*CS%a_v(i,J,2))
-    d1(i,J) = b_denom_1 * b1(i,J)
-    visc_rem_v(i,J,1) = b1(i,J) * CS%h_v(i,J,1)
+    do k=nz-1,1,-1
+      do concurrent (I=Isq:Ieq, G%mask2dCu(I,j) > 0.)
+        visc_rem_u(I,j,k) = visc_rem_u(I,j,k) + c1(I,j,k+1) * visc_rem_u(I,j,k+1)
+      enddo
+    enddo
   enddo
 
-  do k=2,nz
+  ! Now find the meridional viscous remnant using the robust tridiagonal solver.
+  do concurrent (J=Jsq:Jeq)
     if (allocated(visc%Ray_v)) then
-      do J=Jsq,Jeq ; do i=is,ie
-        Ray(i,J) = visc%Ray_v(i,J,k)
-      enddo ; enddo
+      do concurrent (i=is:ie)
+        Ray(i,J) = visc%Ray_v(i,J,1)
+      enddo
+    else
+      do concurrent (i=is:ie)
+        Ray(i,J) = 0.
+      enddo
     endif
 
-    do concurrent (J=Jsq:Jeq, i=is:ie, G%mask2dCv(i,j) > 0.)
-      c1(i,J,k) = dt * CS%a_v(i,J,K) * b1(i,J)
-      b_denom_1 = CS%h_v(i,J,k) + dt * (Ray(i,J) + CS%a_v(i,J,K) * d1(i,J))
-      b1(i,J) = 1.0 / (b_denom_1 + dt * CS%a_v(i,J,K+1))
+    do concurrent (i=is:ie, G%mask2dCv(i,j) > 0.)
+      b_denom_1 = CS%h_v(i,J,1) + dt * (Ray(i,J) + CS%a_v(i,J,1))
+      b1(i,J) = 1.0 / (b_denom_1 + dt*CS%a_v(i,J,2))
       d1(i,J) = b_denom_1 * b1(i,J)
-      visc_rem_v(i,J,k) = (CS%h_v(i,J,k) + dt * CS%a_v(i,J,K) * visc_rem_v(i,J,k-1)) * b1(i,J)
+      visc_rem_v(i,J,1) = b1(i,J) * CS%h_v(i,J,1)
+    enddo
+
+    do k=2,nz
+      if (allocated(visc%Ray_v)) then
+        do concurrent (i=is:ie)
+          Ray(i,J) = visc%Ray_v(i,J,k)
+        enddo
+      endif
+
+      do concurrent (i=is:ie, G%mask2dCv(i,j) > 0.)
+        c1(i,J,k) = dt * CS%a_v(i,J,K) * b1(i,J)
+        b_denom_1 = CS%h_v(i,J,k) + dt * (Ray(i,J) + CS%a_v(i,J,K) * d1(i,J))
+        b1(i,J) = 1.0 / (b_denom_1 + dt * CS%a_v(i,J,K+1))
+        d1(i,J) = b_denom_1 * b1(i,J)
+        visc_rem_v(i,J,k) = (CS%h_v(i,J,k) + dt * CS%a_v(i,J,K) * visc_rem_v(i,J,k-1)) * b1(i,J)
+      enddo
     enddo
-  enddo
 
-  do k=nz-1,1,-1
-    do concurrent (J=Jsq:Jeq, i=is:ie, G%mask2dCv(i,j) > 0.)
-      visc_rem_v(i,J,k) = visc_rem_v(i,J,k) + c1(i,J,k+1) * visc_rem_v(i,J,k+1)
+    do k=nz-1,1,-1
+      do concurrent (i=is:ie, G%mask2dCv(i,j) > 0.)
+        visc_rem_v(i,J,k) = visc_rem_v(i,J,k) + c1(i,J,k+1) * visc_rem_v(i,J,k+1)
+      enddo
     enddo
   enddo
+
   !$omp target exit data map(delete: b1, c1, d1, Ray, b_denom_1)
 
   if (CS%debug) then
@@ -1511,10 +1523,16 @@ subroutine vertvisc_coef(u, v, h, dz, forces, visc, tv, dt, G, GV, US, CS, OBC,
   if (.not.CS%initialized) call MOM_error(FATAL,"MOM_vert_friction(coef): "// &
          "Module must be initialized before it is used.")
 
+  !$omp target enter data map(alloc: I_Hbbl)
+
   h_neglect = GV%H_subroundoff
   dz_neglect = GV%dZ_subroundoff
   a_cpl_max = 1.0e37 * GV%m_to_H * US%T_to_s
-  I_Hbbl(:,:) = 1. / (CS%Hbbl + dz_neglect)
+  do concurrent (j=SZJB_(G), i=SZIB_(G))
+    I_Hbbl(i,j) = 1. / (CS%Hbbl + dz_neglect)
+  enddo
+
+  ! below allocations and initializations will probably be slow on CPU
   if (CS%use_GL90_in_SSW) then
     I_Hbbl_gl90(:,:) = 1. / (CS%Hbbl_gl90 + dz_neglect)
   endif
@@ -1552,8 +1570,8 @@ subroutine vertvisc_coef(u, v, h, dz, forces, visc, tv, dt, G, GV, US, CS, OBC,
     js_W_OBC = max(js, OBC%Js_u_W_obc) ; je_W_OBC = min(je, OBC%je_u_W_obc)
   endif
 
+  !$omp target enter data map(alloc: Ustar_2d)
   call find_ustar(forces, tv, Ustar_2d, G, GV, US, halo=1)
-  !$omp target enter data map(to: Ustar_2d)
 
   ! First do u-points
 
@@ -1917,15 +1935,17 @@ subroutine vertvisc_coef(u, v, h, dz, forces, visc, tv, dt, G, GV, US, CS, OBC,
       enddo
     endif
 
-    do K=1,nz+1
-      do concurrent (j=js:je, i=isq:ieq, do_i(i,j))
-        CS%a_u(I,j,K) = min(a_cpl_max, a_cpl(I,j,K))
+    do concurrent (j=js:je)
+      do K=1,nz+1
+        do concurrent (i=isq:ieq, do_i(i,j))
+          CS%a_u(I,j,K) = min(a_cpl_max, a_cpl(I,j,K))
+        enddo
       enddo
-    enddo
 
-    do k=1,nz
-      do concurrent (j=js:je, i=isq:ieq, do_i(i,j))
-        CS%h_u(I,j,k) = hvel(I,j,k) + h_neglect
+      do k=1,nz
+        do concurrent (i=isq:ieq, do_i(i,j))
+          CS%h_u(I,j,k) = hvel(I,j,k) + h_neglect
+        enddo
       enddo
     enddo
   endif
@@ -2302,15 +2322,17 @@ subroutine vertvisc_coef(u, v, h, dz, forces, visc, tv, dt, G, GV, US, CS, OBC,
       enddo
     endif
 
-    do K=1,nz+1
-      do concurrent (J=Jsq:Jeq, i=is:ie, do_i(i,j))
-        CS%a_v(i,J,K) = min(a_cpl_max, a_cpl(i,J,K))
+    do concurrent (J=Jsq:Jeq)
+      do K=1,nz+1
+        do concurrent (i=is:ie, do_i(i,j))
+          CS%a_v(i,J,K) = min(a_cpl_max, a_cpl(i,J,K))
+        enddo
       enddo
-    enddo
 
-    do k=1,nz
-      do concurrent (J=Jsq:Jeq, i=is:ie, do_i(i,j))
-        CS%h_v(i,J,k) = hvel(i,J,k) + h_neglect
+      do k=1,nz
+        do concurrent (i=is:ie, do_i(i,j))
+          CS%h_v(i,J,k) = hvel(i,J,k) + h_neglect
+        enddo
       enddo
     enddo
   endif
@@ -2460,7 +2482,6 @@ subroutine find_coupling_coef(a_cpl, hvel, do_i, h_harm, bbl_thick, kv_bbl, z_i,
   integer :: nz, max_nk
   integer :: is_N_OBC, is_S_OBC, Is_E_OBC, Is_W_OBC, ie_N_OBC, ie_S_OBC, Ie_E_OBC, Ie_W_OBC
   integer :: js_N_OBC, js_S_OBC, Js_E_OBC, Js_W_OBC, je_N_OBC, je_S_OBC, Je_E_OBC, Je_W_OBC
-
   if (work_on_u) then
     Is = G%IscB ; Ie = G%IecB
     js = G%jsc ; je = G%jec
@@ -2865,19 +2886,21 @@ subroutine find_coupling_coef(a_cpl, hvel, do_i, h_harm, bbl_thick, kv_bbl, z_i,
 
       max_nk = 0
       if (work_on_u) then
-        do concurrent (j=js:je, i=is:ie, do_i(i,j))
+        do concurrent (j=js:je, i=is:ie, do_i(i,j)) DO_LOCALITY(reduce(max:max_nk))
           nk_in_ml(I,j) = ceiling(visc%nkml_visc_u(I,j))
           max_nk = max(max_nk, nk_in_ml(I,j))
         enddo
         ! TODO: max_nk is transferred back, try to keep it on the device
 
-        do k=1,max_nk
-          do concurrent (j=js:je, i=is:ie, do_i(i,j))
-            if (k <= visc%nkml_visc_u(I,j)) then ! This layer is all in the ML.
-              h_ml(i,j) = h_ml(i,j) + hvel(i,j,k)
-            elseif (k < visc%nkml_visc_u(I,j) + 1.) then ! Part of this layer is in the ML.
-              h_ml(i,j) = h_ml(i,j) + ((visc%nkml_visc_u(I,j) + 1.) - k) * hvel(i,j,k)
-            endif
+        do concurrent (j=js:je)
+          do k=1,max_nk
+            do concurrent (i=is:ie, do_i(i,j))
+              if (k <= visc%nkml_visc_u(I,j)) then ! This layer is all in the ML.
+                h_ml(i,j) = h_ml(i,j) + hvel(i,j,k)
+              elseif (k < visc%nkml_visc_u(I,j) + 1.) then ! Part of this layer is in the ML.
+                h_ml(i,j) = h_ml(i,j) + ((visc%nkml_visc_u(I,j) + 1.) - k) * hvel(i,j,k)
+              endif
+            enddo
           enddo
         enddo
       else
@@ -2999,26 +3022,28 @@ subroutine find_coupling_coef(a_cpl, hvel, do_i, h_harm, bbl_thick, kv_bbl, z_i,
         endif ; enddo ; enddo
       enddo
     else
-      do K=2,max_nk
-        do concurrent (j=js:je, i=is:ie, k <= nk_in_ml(i,j))
-          z_t(i,j) = z_t(i,j) + hvel(i,j,k-1)
-
-          temp1 = (z_t(i,j) * h_ml(i,j) - z_t(i,j) * z_t(i,j))
-          !   This viscosity is set to go to 0 at the mixed layer top and bottom (in a log-layer)
-          ! and be further limited by rotation to give the natural Ekman length.
-          ! The following expressions are mathematically equivalent.
-          if (GV%Boussinesq .or. (CS%answer_date < 20230601)) then
-            visc_ml = u_star(i,j) * CS%vonKar * (GV%Z_to_H * temp1 * u_star(i,j)) &
-                / (absf(i,j) * temp1 + (h_ml(i,j)+h_neglect) * u_star(i,j))
-          else
-            visc_ml = CS%vonKar * (temp1 * tau_mag(i,j)) &
-                / (absf(i,j) * temp1 + (h_ml(i,j) + h_neglect) * u_star(i,j))
-          endif
-          a_ml = visc_ml / (0.25 * (hvel(i,j,k) + hvel(i,j,k-1) + h_neglect) + 0.5 * I_amax * visc_ml)
+      do concurrent (j=js:je)
+        do K=2,max_nk
+          do concurrent (i=is:ie, k <= nk_in_ml(i,j))
+            z_t(i,j) = z_t(i,j) + hvel(i,j,k-1)
+
+            temp1 = (z_t(i,j) * h_ml(i,j) - z_t(i,j) * z_t(i,j))
+            !   This viscosity is set to go to 0 at the mixed layer top and bottom (in a log-layer)
+            ! and be further limited by rotation to give the natural Ekman length.
+            ! The following expressions are mathematically equivalent.
+            if (GV%Boussinesq .or. (CS%answer_date < 20230601)) then
+              visc_ml = u_star(i,j) * CS%vonKar * (GV%Z_to_H * temp1 * u_star(i,j)) &
+                  / (absf(i,j) * temp1 + (h_ml(i,j)+h_neglect) * u_star(i,j))
+            else
+              visc_ml = CS%vonKar * (temp1 * tau_mag(i,j)) &
+                  / (absf(i,j) * temp1 + (h_ml(i,j) + h_neglect) * u_star(i,j))
+            endif
+            a_ml = visc_ml / (0.25 * (hvel(i,j,k) + hvel(i,j,k-1) + h_neglect) + 0.5 * I_amax * visc_ml)
 
-          ! Choose the largest estimate of a_cpl, but these could be changed to be additive.
-          a_cpl(i,j,K) = max(a_cpl(i,j,K), a_ml)
-          ! An option could be added to change this to: a_cpl(i,K) = a_cpl(i,K) + a_ml
+            ! Choose the largest estimate of a_cpl, but these could be changed to be additive.
+            a_cpl(i,j,K) = max(a_cpl(i,j,K), a_ml)
+            ! An option could be added to change this to: a_cpl(i,K) = a_cpl(i,K) + a_ml
+          enddo
         enddo
       enddo
     endif
@@ -3261,9 +3286,9 @@ subroutine vertvisc_limit_vel(u, v, h, ADp, CDp, forces, visc, dt, G, GV, US, CS
     endif
   endif
   !$omp target exit data map(release: dowrite, vel_report, trunc_any_array)
-  !$omp target exit data map(from:u_old, v_old)
 
   if (len_trim(CS%u_trunc_file) > 0) then
+    !$omp target update from(u_old)
     !I need to port this
     do j=js,je ; do I=Isq,Ieq ; if (dowrite(I,j)) then
       ! Call a diagnostic reporting subroutines are called if unphysically large values are found.
@@ -3273,6 +3298,7 @@ subroutine vertvisc_limit_vel(u, v, h, ADp, CDp, forces, visc, dt, G, GV, US, CS
   endif
 
   if (len_trim(CS%v_trunc_file) > 0) then
+    !$omp target update from(v_old)
     do J=Jsq,Jeq ; do i=is,ie ; if (dowrite(i,J)) then
       ! Call a diagnostic reporting subroutines are called if unphysically large values are found.
       call write_v_accel(i, J, v_old, h, ADp, CDp, dt, G, GV, US, CS%PointAccel_CSp, &