From dfcc89bcec7e45ae8616b0b76ee97a0d9ba9c456 Mon Sep 17 00:00:00 2001
From: Zhi Liang <Zhi.Liang@noaa.gov>
Date: Tue, 24 Jun 2014 13:38:33 -0400
Subject: [PATCH] Improve single thread performance. Also add more OMP
 directives in MOM_barotropics

---
 src/core/MOM_barotropic.F90       | 720 ++++++++++++++++++++++--------
 src/equation_of_state/MOM_EOS.F90 | 111 +++--
 2 files changed, 606 insertions(+), 225 deletions(-)

diff --git a/src/core/MOM_barotropic.F90 b/src/core/MOM_barotropic.F90
index 3ab5d4a281..4e566010a9 100644
--- a/src/core/MOM_barotropic.F90
+++ b/src/core/MOM_barotropic.F90
@@ -574,11 +574,19 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
     BTCL_v        ! A repackaged version of the v-point information in BT_cont.
   ! End of wide-sized variables.
 
+  real, dimension(SZIBW_(CS),SZJW_(CS)) :: &
+    ubt_prev, uhbt_prev, ubt_sum_prev, uhbt_sum_prev, ubt_wtd_prev  ! for OBC
+  real, dimension(SZIW_(CS),SZJBW_(CS)) :: &
+    vbt_prev, vhbt_prev, vbt_sum_prev, vhbt_sum_prev, vbt_wtd_prev  ! for OBC
+
+  real, dimension(SZIB_(G),SZJB_(G)) :: &
+    vel_trans_2D, &   ! The combination of the previous and current velocity
+                      ! that does the mass transport, in m s-1.
+    Cor_2D, gradP_2D  ! The Coriolis and pressure gradient accelerations, m s-1.
+
   real :: I_Rho0      ! The inverse of the mean density (Rho0), in m3 kg-1.
   real :: visc_rem    ! A work variable that may equal visc_rem_[uv].  Nondim.
   real :: vel_prev    ! The previous velocity in m s-1.
-  real :: vel_trans   ! The combination of the previous and current velocity
-                      ! that does the mass transport, in m s-1.
   real :: dtbt        ! The barotropic time step in s.
   real :: bebt        ! A copy of CS%bebt.
   real :: be_proj     ! The fractional amount by which velocities are projected
@@ -594,7 +602,6 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
                       ! than physical problem would suggest.
   real :: Instep      ! The inverse of the number of barotropic time steps
                       ! to take.
-  real :: Cor, gradP  ! The Coriolis and pressure gradient accelerations, m s-1.
   real :: wt_end      ! The weighting of the final value of eta_PF, ND.
   integer :: nstep    ! The number of barotropic time steps to take.
   type(time_type) :: &
@@ -730,37 +737,42 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
 ! barotropic momentum equations.  This has to be done quite early to start
 ! the halo update that needs to be completed before the next calculations.
   if (CS%linearized_BT_PV) then
-!$OMP parallel do default(none) shared(jsvf,jevf,isvf,ievf,q,CS)
+!$OMP parallel default(none) shared(jsvf,jevf,isvf,ievf,q,CS,DCor_u,DCor_v)
+!$OMP do
     do J=jsvf-2,jevf+1 ; do I=isvf-2,ievf+1
       q(I,J) = CS%q_D(I,j)
     enddo ; enddo
-!$OMP parallel do default(none) shared(jsvf,jevf,isvf,ievf,DCor_u,CS)
+!$OMP do
     do j=jsvf-1,jevf+1 ; do I=isvf-2,ievf+1
       DCor_u(I,j) = CS%D_u_Cor(I,j)
     enddo ; enddo
-!$OMP parallel do default(none) shared(jsvf,jevf,isvf,ievf,DCor_v,CS)
+!$OMP do
     do J=jsvf-2,jevf+1 ; do i=isvf-1,ievf+1
       DCor_v(i,J) = CS%D_v_Cor(i,J)
     enddo ; enddo
+!$OMP end parallel
   else
     q(:,:) = 0.0 ; DCor_u(:,:) = 0.0 ; DCor_v(:,:) = 0.0
     !  This option has not yet been written properly.
     !  D here should be replaced with D+eta(Bous) or eta(non-Bous).
-!$OMP parallel do default(none) shared(js,je,is,ie,DCor_u,G)
+!$OMP parallel default(none) shared(js,je,is,ie,DCor_u,DCor_v,G,q)
+!$OMP do
     do j=js,je ; do I=is-1,ie
       DCor_u(I,j) = 0.5 * (G%bathyT(i+1,j) + G%bathyT(i,j))
     enddo ; enddo
-!$OMP parallel do default(none) shared(js,je,is,ie,DCor_v,G)
+!$OMP do 
     do J=js-1,je ; do i=is,ie
       DCor_v(i,J) = 0.5 * (G%bathyT(i,j+1) + G%bathyT(i,j))
     enddo ; enddo
-!$OMP parallel do default(none) shared(js,je,is,ie,q,G)
+!$OMP do 
     do J=js-1,je ; do I=is-1,ie
       q(I,J) = 0.25 * G%CoriolisBu(I,J) * &
            ((G%areaT(i,j) + G%areaT(i+1,j+1)) + (G%areaT(i+1,j) + G%areaT(i,j+1))) / &
            ((G%areaT(i,j) * G%bathyT(i,j) + G%areaT(i+1,j+1) * G%bathyT(i+1,j+1)) + &
             (G%areaT(i+1,j) * G%bathyT(i+1,j) + G%areaT(i,j+1) * G%bathyT(i,j+1)))
     enddo ; enddo
+!$OMP end parallel
+
     ! With very wide halos, q and D need to be calculated on the available data
     ! domain and then updated onto the full computational domain.
     ! These calculations can be done almost immediately, but the halo updates
@@ -779,6 +791,15 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
   endif
 
   ! Zero out various wide-halo arrays.
+!$OMP parallel default(none) shared(CS,gtot_E,gtot_W,gtot_N,gtot_S,eta,eta_PF,   &
+!$OMP                               interp_eta_PF,eta_PF_1,d_eta_PF,BT_force_u,  &
+!$OMP                               p_surf_dyn,dyn_coef_eta,Cor_ref_u,ubt,Datu,  &
+!$OMP                               bt_rem_u,uhbt0,Cor_ref_v,BT_force_v,vbt,     &
+!$OMP                               Datv,bt_rem_v,vhbt0,G,eta_in,eta_PF_start,   &
+!$OMP                               nz,js,je,Isq,Ieq,visc_rem_u,Instep,wt_u,     &
+!$OMP                               Jsq,Jeq,is,ie,visc_rem_v,wt_v,eta_PF_in,pbce)&
+!$OMP                               private(visc_rem)
+!$OMP do 
   do j=CS%jsdw,CS%jedw ; do i=CS%isdw,CS%iedw
     gtot_E(i,j) = 0.0 ; gtot_W(i,j) = 0.0
     gtot_N(i,j) = 0.0 ; gtot_S(i,j) = 0.0
@@ -793,10 +814,12 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
   !   The halo regions of various arrays need to be initialized to
   ! non-NaNs in case the neighboring domains are not part of the ocean.
   ! Otherwise a halo update later on fills in the correct values.
+!$OMP do 
   do j=CS%jsdw,CS%jedw ; do I=CS%isdw-1,CS%iedw
     Cor_ref_u(I,j) = 0.0 ; BT_force_u(I,j) = 0.0 ; ubt(I,j) = 0.0
     Datu(I,j) = 0.0 ; bt_rem_u(I,j) = 0.0 ; uhbt0(I,j) = 0.0
   enddo ; enddo
+!$OMP do 
   do J=CS%jsdw-1,CS%jedw ; do i=CS%isdw,CS%iedw
     Cor_ref_v(i,J) = 0.0 ; BT_force_v(i,J) = 0.0 ; vbt(i,J) = 0.0
     Datv(i,J) = 0.0 ; bt_rem_v(i,J) = 0.0 ; vhbt0(I,j) = 0.0
@@ -805,19 +828,21 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
   ! Copy input arrays into their wide-halo counterparts.  eta_in and eta_PF_in
   ! have the correct values in their halo regions.
   if (interp_eta_PF) then
+!$OMP do
     do j=G%jsd,G%jed ; do i=G%isd,G%ied
       eta(i,j) = eta_in(i,j)
       eta_PF_1(i,j) = eta_PF_start(i,j)
       d_eta_PF(i,j) = eta_PF_in(i,j) - eta_PF_start(i,j)
     enddo ; enddo
   else
+!$OMP do
     do j=G%jsd,G%jed ; do i=G%isd,G%ied
       eta(i,j) = eta_in(i,j)
       eta_PF(i,j) = eta_PF_in(i,j)
     enddo ; enddo
   endif
 
-!$OMP parallel do default(none) shared(Isq,Ieq,js,je,nz,visc_rem_u,Instep,wt_u,CS) private(visc_rem)
+!$OMP do
   do k=1,nz ; do j=js-1,je+1 ; do I=Isq-1,Ieq+1
     ! rem needs greater than visc_rem_u and 1-Instep/visc_rem_u.
     ! The 0.5 below is just for safety.
@@ -828,7 +853,7 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
     else ; visc_rem = 1.0 - 0.5*Instep/visc_rem_u(I,j,k) ; endif
     wt_u(I,j,k) = CS%frhatu(I,j,k) * visc_rem
   enddo ; enddo ; enddo
-!$OMP parallel do default(none) shared(is,ie,Jsq,Jeq,nz,visc_rem_v,Instep,wt_v,CS) private(visc_rem)
+!$OMP do
   do k=1,nz ; do J=Jsq-1,Jeq+1 ; do i=is-1,ie+1
     ! rem needs greater than visc_rem_v and 1-Instep/visc_rem_v.
     if (visc_rem_v(i,J,k) <= 0.0) then ; visc_rem = 0.0
@@ -843,17 +868,23 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
   ! accelerations across the various faces, with names for the relative
   ! locations of the faces to the pressure point.  They will have their halos
   ! updated later on. 
-  do k=1,nz
-    do j=js,je ; do I=is-1,ie
+!$OMP do
+  do j = js,je
+    do k=1,nz; do I=is-1,ie
       gtot_E(i,j)   = gtot_E(i,j)   + pbce(i,j,k)   * wt_u(I,j,k)
       gtot_W(i+1,j) = gtot_W(i+1,j) + pbce(i+1,j,k) * wt_u(I,j,k)
     enddo ; enddo
-    do J=js-1,je ; do i=is,ie
+  enddo
+!$OMP do
+  do J=js-1,je 
+     do k=1,nz; do i=is,ie
       gtot_N(i,j)   = gtot_N(i,j)   + pbce(i,j,k)   * wt_v(i,J,k)
       gtot_S(i,j+1) = gtot_S(i,j+1) + pbce(i,j+1,k) * wt_v(i,J,k)
     enddo ; enddo
   enddo
   
+!$OMP end parallel
+
   if (CS%tides) then
     call tidal_forcing_sensitivity(G, CS%tides_CSp, det_de)
     dgeo_de = 1.0 + det_de + CS%G_extra
@@ -896,17 +927,30 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
 ! between the accelerations due to the average of the layer equations and the
 ! barotropic calculation.
   !  ### Should IDatu here be replaced with 1/D+eta(Bous) or 1/eta(non-Bous)?
+
+!$OMP parallel default(none) shared(is,ie,js,je,BT_force_u,BT_force_v,fluxes, &
+!$OMP                               I_rho0,CS,visc_rem_u,visc_rem_v,taux_bot, &
+!$OMP                               tauy_bot,nz,Isq,Ieq,Jsq,Jeq,wt_u,         &
+!$OMP                               bc_accel_u,wt_v,bc_accel_v,add_uh0,uhbt,  &
+!$OMP                               ubt,vhbt,vbt,uh0,u_uh0,vh0,v_vh0,         &
+!$OMP                               use_BT_cont,uhbt0,BTCL_u,vhbt0,BTCL_v,    &
+!$OMP                               Datu,Datv,jsvf,jevf,isvf,ievf,u_accel_bt, &
+!$OMP                               v_accel_bt,U_in,V_in,Idt)
+!$OMP do
   do j=js,je ; do I=is-1,ie
     BT_force_u(I,j) = fluxes%taux(I,j) * I_rho0*CS%IDatu(I,j)*visc_rem_u(I,j,1)
   enddo ; enddo
+!$OMP do
   do J=js-1,je ; do i=is,ie
     BT_force_v(i,J) = fluxes%tauy(i,J) * I_rho0*CS%IDatv(i,J)*visc_rem_v(i,J,1)
   enddo ; enddo
   if (present(taux_bot) .and. present(tauy_bot)) then
     if (associated(taux_bot) .and. associated(tauy_bot)) then
+!$OMP do
       do j=js,je ; do I=is-1,ie
         BT_force_u(I,j) = BT_force_u(I,j) - taux_bot(I,j) * I_rho0 * CS%IDatu(I,j)
       enddo ; enddo
+!$OMP do
       do J=js-1,je ; do i=is,ie
         BT_force_v(i,J) = BT_force_v(i,J) - tauy_bot(i,J) * I_rho0 * CS%IDatv(i,J)
       enddo ; enddo
@@ -915,37 +959,55 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
 
   ! bc_accel_u & bc_accel_v are only available on the potentially
   ! non-symmetric computational domain.
-  do k=1,nz ; do j=js,je ; do I=Isq,Ieq
-    BT_force_u(I,j) = BT_force_u(I,j) + wt_u(I,j,k) * bc_accel_u(I,j,k)
-  enddo ; enddo ; enddo
-  do k=1,nz ; do J=Jsq,Jeq ; do i=is,ie
-    BT_force_v(i,J) = BT_force_v(i,J) + wt_v(i,J,k) * bc_accel_v(i,J,k)
-  enddo ; enddo ; enddo
+!$OMP do
+  do j=js,je
+    do k=1,nz ; do I=Isq,Ieq
+       BT_force_u(I,j) = BT_force_u(I,j) + wt_u(I,j,k) * bc_accel_u(I,j,k)
+    enddo ; enddo 
+  enddo
+!$OMP do
+  do J=Jsq,Jeq
+    do k=1,nz ; do i=is,ie
+       BT_force_v(i,J) = BT_force_v(i,J) + wt_v(i,J,k) * bc_accel_v(i,J,k)
+    enddo ; enddo
+  enddo
 
   ! Determine the difference between the sum of the layer fluxes and the 
   ! barotropic fluxes found from the same input velocities.
   if (add_uh0) then
+!$OMP do 
     do j=js,je ; do I=is-1,ie ; uhbt(I,j) = 0.0 ; ubt(I,j) = 0.0 ; enddo ; enddo
+!$OMP do
     do J=js-1,je ; do i=is,ie ; vhbt(i,J) = 0.0 ; vbt(i,J) = 0.0 ; enddo ; enddo
-    do k=1,nz ; do j=js,je ; do I=is-1,ie
-      uhbt(I,j) = uhbt(I,j) + uh0(I,j,k)
-      ubt(I,j) = ubt(I,j) + wt_u(I,j,k) * u_uh0(I,j,k)
-    enddo ; enddo ; enddo
-    do k=1,nz ; do J=js-1,je ; do i=is,ie
-      vhbt(i,J) = vhbt(i,J) + vh0(i,J,k)
-      vbt(i,J) = vbt(i,J) + wt_v(i,J,k) * v_vh0(i,J,k)
-    enddo ; enddo ; enddo
+!$OMP do
+    do j=js,je
+      do k=1,nz ; do I=is-1,ie
+        uhbt(I,j) = uhbt(I,j) + uh0(I,j,k)
+        ubt(I,j) = ubt(I,j) + wt_u(I,j,k) * u_uh0(I,j,k)
+      enddo ; enddo
+    enddo
+!$OMP do
+    do J=js-1,je
+      do k=1,nz ; do i=is,ie
+        vhbt(i,J) = vhbt(i,J) + vh0(i,J,k)
+        vbt(i,J) = vbt(i,J) + wt_v(i,J,k) * v_vh0(i,J,k)
+      enddo ; enddo 
+    enddo
     if (use_BT_cont) then
+!$OMP do
       do j=js,je ; do I=is-1,ie
         uhbt0(I,j) = uhbt(I,j) - find_uhbt(ubt(I,j),BTCL_u(I,j))
       enddo ; enddo
+!$OMP do
       do J=js-1,je ; do i=is,ie
         vhbt0(i,J) = vhbt(i,J) - find_vhbt(vbt(i,J),BTCL_v(i,J))
       enddo ; enddo
     else
+!$OMP do
       do j=js,je ; do I=is-1,ie
         uhbt0(I,j) = uhbt(I,j) - Datu(I,j)*ubt(I,j)
       enddo ; enddo
+!$OMP do
       do J=js-1,je ; do i=is,ie
         vhbt0(i,J) = vhbt(i,J) - Datv(i,J)*vbt(i,J)
       enddo ; enddo
@@ -953,30 +1015,42 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
   endif
 
 ! Calculate the initial barotropic velocities from the layer's velocities.
+!$OMP do
   do j=jsvf-1,jevf+1 ; do I=isvf-2,ievf+1
     ubt(I,j) = 0.0 ; uhbt(I,j) = 0.0 ; u_accel_bt(I,j) = 0.0
   enddo ; enddo
+!$OMP do
   do J=jsvf-2,jevf+1 ; do i=isvf-1,ievf+1
     vbt(i,J) = 0.0 ; vhbt(i,J) = 0.0 ; v_accel_bt(i,J) = 0.0
   enddo ; enddo
-  do k=1,nz ; do j=js,je ; do I=is-1,ie
-    ubt(I,j) = ubt(I,j) + wt_u(I,j,k) * U_in(I,j,k)
-  enddo ; enddo ; enddo
-  do k=1,nz ; do J=js-1,je ; do i=is,ie
-    vbt(i,J) = vbt(i,J) + wt_v(i,J,k) * V_in(i,J,k)
-  enddo ; enddo ; enddo
+!$OMP do
+  do j=js,je
+    do k=1,nz ; do I=is-1,ie
+      ubt(I,j) = ubt(I,j) + wt_u(I,j,k) * U_in(I,j,k)
+    enddo ; enddo 
+  enddo
+
+!$OMP do
+  do J=js-1,je
+    do k=1,nz ; do i=is,ie
+      vbt(i,J) = vbt(i,J) + wt_v(i,J,k) * V_in(i,J,k)
+    enddo ; enddo
+  enddo
 
   if (CS%gradual_BT_ICs) then
+!$OMP do
     do j=js,je ; do I=is-1,ie
       BT_force_u(I,j) = BT_force_u(I,j) + (ubt(I,j) - CS%ubt_IC(I,j)) * Idt
       ubt(I,j) = CS%ubt_IC(I,j)
     enddo ; enddo
+!$OMP do
     do J=js-1,je ; do i=is,ie
       BT_force_v(i,J) = BT_force_v(i,J) + (vbt(i,J) - CS%vbt_IC(i,J)) * Idt
       vbt(i,J) = CS%vbt_IC(i,J)
     enddo ; enddo
   endif
 
+!$OMP end parallel
   if ((Isq > is-1) .or. (Jsq > js-1)) then
     ! Non-symmetric memory is being used, so the edge values need to be
     ! filled in with a halo update of a non-symmetric array.
@@ -1008,7 +1082,11 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
   endif
 
   ! Determine the weighted Coriolis parameters for the neighboring velocities.
-!$OMP parallel do default(none) shared(isvf,ievf,jsvf,jevf,amer,bmer,cmer,dmer,DCor_u,q,CS)
+!$OMP parallel default(none) shared(isvf,ievf,jsvf,jevf,amer,bmer,cmer,dmer,DCor_u, &
+!$OMP                               q,CS,azon,bzon,czon,dzon,DCor_v,is,ie,js,je,nz, &
+!$OMP                               ubt_Cor,vbt_Cor,wt_u,U_Cor,wt_v,V_Cor,          &
+!$OMP                               Cor_ref_u,Cor_ref_v )
+!$OMP do
   do J=jsvf-1,jevf ; do i=isvf-1,ievf+1
     if (CS%Sadourny) then
       amer(I-1,j) = DCor_u(I-1,j) * q(I-1,J)
@@ -1027,7 +1105,7 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
     endif
   enddo ; enddo
 
-!$OMP parallel do default(none) shared(isvf,ievf,jsvf,jevf,azon,bzon,czon,dzon,DCor_v,q,CS)
+!$OMP do
   do j=jsvf-1,jevf+1 ; do I=isvf-1,ievf
     if (CS%Sadourny) then
       azon(I,j) = DCor_v(i+1,J) * q(I,J)
@@ -1048,28 +1126,36 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
 
   !   If they are present, use u_Cor and v_Cor as the reference values for the
   ! Coriolis terms, including the viscous remnant if it is present.
+!$OMP do
   do j=js-1,je+1 ; do I=is-1,ie ; ubt_Cor(I,j) = 0.0 ; enddo ; enddo
+!$OMP do
   do J=js-1,je ; do i=is-1,ie+1 ; vbt_Cor(i,J) = 0.0 ; enddo ; enddo
-  do k=1,nz ; do j=js-1,je+1 ; do I=is-1,ie
-    ubt_Cor(I,j) = ubt_Cor(I,j) + wt_u(I,j,k) * U_Cor(I,j,k)
-  enddo ; enddo ; enddo
-  do k=1,nz ; do J=js-1,je ; do i=is-1,ie+1
-    vbt_Cor(i,J) = vbt_Cor(i,J) + wt_v(i,J,k) * V_Cor(i,J,k)
-  enddo ; enddo ; enddo
+!$OMP do
+  do j=js-1,je+1
+    do k=1,nz ; do I=is-1,ie
+      ubt_Cor(I,j) = ubt_Cor(I,j) + wt_u(I,j,k) * U_Cor(I,j,k)
+    enddo ; enddo
+  enddo
+!$OMP do
+  do J=js-1,je
+    do k=1,nz ; do i=is-1,ie+1
+      vbt_Cor(i,J) = vbt_Cor(i,J) + wt_v(i,J,k) * V_Cor(i,J,k)
+    enddo ; enddo
+  enddo
 
-!$OMP parallel do default(none) shared(is,ie,js,je,Cor_ref_u,azon,bzon,czon,dzon,vbt_Cor)
+!$OMP do
   do j=js,je ; do I=is-1,ie
     Cor_ref_u(I,j) =  &
         ((azon(I,j) * vbt_Cor(i+1,j) + czon(I,j) * vbt_Cor(i  ,j-1)) + &
          (bzon(I,j) * vbt_Cor(i  ,j) + dzon(I,j) * vbt_Cor(i+1,j-1)))
   enddo ; enddo
-!$OMP parallel do default(none) shared(is,ie,js,je,Cor_ref_v,amer,bmer,cmer,dmer,ubt_Cor)
+!$OMP do
   do J=js-1,je ; do i=is,ie
     Cor_ref_v(i,J) = -1.0 * &
         ((amer(I-1,j) * ubt_Cor(I-1,j) + cmer(I  ,j+1) * ubt_Cor(I  ,j+1)) + &
          (bmer(I  ,j) * ubt_Cor(I  ,j) + dmer(I-1,j+1) * ubt_Cor(I-1,j+1)))
   enddo ; enddo
-
+!$OMP end parallel
 ! Complete the previously initiated message passing.
   if (id_clock_calc_pre > 0) call cpu_clock_end(id_clock_calc_pre)
   if (id_clock_pass_pre > 0) call cpu_clock_begin(id_clock_pass_pre)
@@ -1105,48 +1191,72 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
   endif
   if (id_clock_pass_pre > 0) call cpu_clock_end(id_clock_pass_pre)
   if (id_clock_calc_pre > 0) call cpu_clock_begin(id_clock_calc_pre)
-
+!$OMP parallel default(none) shared(is,ie,js,je,nz,av_rem_u,av_rem_v,CS,visc_rem_u, &
+!$OMP                               visc_rem_v,bt_rem_u,G,nstep,bt_rem_v,Instep,    &
+!$OMP                               find_etaav,jsvf,jevf,isvf,ievf,eta_sum,eta_wtd, &
+!$OMP                               ubt_sum,uhbt_sum,PFu_bt_sum,Coru_bt_sum,ubt_wtd,&
+!$OMP                               ubt_trans,vbt_sum,vhbt_sum,PFv_bt_sum,          &
+!$OMP                               Corv_bt_sum,vbt_wtd,vbt_trans,eta_src,dt,dtbt)
+!$OMP do
   do j=js-1,je+1 ; do I=is-1,ie ; av_rem_u(I,j) = 0.0 ; enddo ; enddo
+!$OMP do
   do J=js-1,je ; do i=is-1,ie+1 ; av_rem_v(i,J) = 0.0 ; enddo ; enddo
-  do k=1,nz ; do j=js-1,je+1 ; do I=is-1,ie
-    av_rem_u(I,j) = av_rem_u(I,j) + CS%frhatu(I,j,k) * visc_rem_u(I,j,k)
-  enddo ; enddo ; enddo
-  do k=1,nz ; do J=js-1,je ; do i=is-1,ie+1
-    av_rem_v(i,J) = av_rem_v(i,J) + CS%frhatv(i,J,k) * visc_rem_v(i,J,k)
-  enddo ; enddo ; enddo
+!$OMP do
+  do j=js-1,je+1
+    do k=1,nz ; do I=is-1,ie
+      av_rem_u(I,j) = av_rem_u(I,j) + CS%frhatu(I,j,k) * visc_rem_u(I,j,k)
+    enddo ; enddo 
+  enddo
+!$OMP do
+  do J=js-1,je 
+    do k=1,nz ; do i=is-1,ie+1
+      av_rem_v(i,J) = av_rem_v(i,J) + CS%frhatv(i,J,k) * visc_rem_v(i,J,k)
+    enddo ; enddo 
+  enddo
   if (CS%strong_drag) then
+!$OMP do
     do j=js-1,je+1 ; do I=is-1,ie
       bt_rem_u(I,j) = G%mask2dCu(I,j) * &
          ((nstep * av_rem_u(I,j)) / (1.0 + (nstep-1)*av_rem_u(I,j)))
     enddo ; enddo
+!$OMP do
     do J=js-1,je ; do i=is-1,ie+1
       bt_rem_v(i,J) = G%mask2dCv(i,J) * &
          ((nstep * av_rem_v(i,J)) / (1.0 + (nstep-1)*av_rem_v(i,J)))
     enddo ; enddo
   else
+!$OMP do
     do j=js-1,je+1 ; do I=is-1,ie
       bt_rem_u(I,j) = 0.0
       if (G%mask2dCu(I,j) * av_rem_u(I,j) > 0.0) &
         bt_rem_u(I,j) = G%mask2dCu(I,j) * (av_rem_u(I,j)**Instep)
     enddo ; enddo
+!$OMP do
     do J=js-1,je ; do i=is-1,ie+1
       bt_rem_v(i,J) = 0.0
       if (G%mask2dCv(i,J) * av_rem_v(i,J) > 0.0) &
         bt_rem_v(i,J) = G%mask2dCv(i,J) * (av_rem_v(i,J)**Instep)
     enddo ; enddo
   endif
-
   ! Zero out the arrays for various time-averaged quantities.
-  if (find_etaav) then ; do j=jsvf-1,jevf+1 ; do i=isvf-1,ievf+1
-    eta_sum(i,j) = 0.0 ; eta_wtd(i,j) = 0.0
-  enddo ; enddo ; else ; do j=jsvf-1,jevf+1 ; do i=isvf-1,ievf+1
-    eta_wtd(i,j) = 0.0
-  enddo ; enddo ; endif
+  if (find_etaav) then  
+!$OMP do
+    do j=jsvf-1,jevf+1 ; do i=isvf-1,ievf+1
+      eta_sum(i,j) = 0.0 ; eta_wtd(i,j) = 0.0
+    enddo ; enddo 
+  else 
+!$OMP do
+    do j=jsvf-1,jevf+1 ; do i=isvf-1,ievf+1
+      eta_wtd(i,j) = 0.0
+    enddo ; enddo 
+   endif
+!$OMP do
   do j=jsvf-1,jevf+1 ; do I=isvf-1,ievf
     ubt_sum(I,j) = 0.0 ; uhbt_sum(I,j) = 0.0
     PFu_bt_sum(I,j) = 0.0 ; Coru_bt_sum(I,j) = 0.0
     ubt_wtd(I,j) = 0.0 ; ubt_trans(I,j) = 0.0
   enddo ; enddo
+!$OMP do
   do J=jsvf-1,jevf ; do i=isvf-1,ievf+1
     vbt_sum(i,J) = 0.0 ; vhbt_sum(i,J) = 0.0
     PFv_bt_sum(i,J) = 0.0 ; Corv_bt_sum(i,J) = 0.0
@@ -1154,14 +1264,17 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
   enddo ; enddo
 
   ! Set the mass source, after first initializing the halos to 0.
+!$OMP do
   do j=jsvf-1,jevf+1; do i=isvf-1,ievf+1 ; eta_src(i,j) = 0.0 ; enddo ; enddo
   if (CS%bound_BT_corr) then ; do j=js,je ; do i=is,ie
     if (abs(CS%eta_cor(i,j)) > dt*CS%eta_cor_bound(i,j)) &
       CS%eta_cor(i,j) = sign(dt*CS%eta_cor_bound(i,j),CS%eta_cor(i,j))
   enddo ; enddo ; endif
+!$OMP do
   do j=js,je ; do i=is,ie
     eta_src(i,j) = G%mask2dT(i,j) * (Instep * CS%eta_cor(i,j) + dtbt * CS%eta_source(i,j))
   enddo ; enddo
+!$OMP end parallel
 
   if (CS%dynamic_psurf) then
     ice_is_rigid = (associated(fluxes%rigidity_ice_u) .and. &
@@ -1169,7 +1282,12 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
     H_min_dyn = G%m_to_H * CS%Dmin_dyn_psurf
     if (ice_is_rigid .and. use_BT_cont) &
       call BT_cont_to_face_areas(BT_cont, Datu, Datv, G, MS, 0, .true.)
-    if (ice_is_rigid) then ; do j=js,je ; do i=is,ie
+    if (ice_is_rigid) then 
+!$OMP parallel do default(none) shared(is,ie,js,je,dgeo_de,bebt,G,gtot_E,Datu,  &
+!$OMP                                  gtot_W,gtot_N,gtot_S,Datv,H_min_dyn,     &
+!$OMP                                  CS,dtbt,fluxes,dyn_coef_eta )            &
+!$OMP                          private(Idt_max2,H_eff_dx2,dyn_coef_max,ice_strength)
+      do j=js,je ; do i=is,ie
       ! First determine the maximum stable value for dyn_coef_eta.
 
       !   This estimate of the maximum stable time step is pretty accurate for
@@ -1429,22 +1547,30 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
         call find_face_areas(Datu, Datv, G, CS, MS, eta, 1+iev-ie)
     endif
 
+!$OMP parallel default(none) shared(CS,isv,iev,jsv,jev,project_velocity,use_BT_cont, &
+!$OMP                               uhbt,vhbt,ubt,BTCL_u,uhbt0,vbt,BTCL_v,vhbt0,     &
+!$OMP                               eta_pred,eta,eta_src,dtbt,Datu,Datv,p_surf_dyn,  &
+!$OMP                               dyn_coef_eta,find_etaav,is,ie,js,je,eta_sum,     &
+!$OMP                               wt_accel2,n,eta_PF_BT,interp_eta_PF,wt_end,      &
+!$OMP                               Instep,eta_PF,eta_PF_1,d_eta_PF,                 &
+!$OMP                               apply_OBC_flather,ubt_old,vbt_old )
     if (CS%dynamic_psurf .or. .not.project_velocity) then
       if (use_BT_cont) then
-!$OMP parallel do default(none) shared(isv,iev,jsv,jev,uhbt,ubt,BTCL_u,uhbt0)
+!$OMP do
         do j=jsv-1,jev+1 ; do I=isv-2,iev+1
           uhbt(I,j) = find_uhbt(ubt(I,j),BTCL_u(I,j)) + uhbt0(I,j)
         enddo ; enddo
+!$OMP do
         do J=jsv-2,jev+1 ; do i=isv-1,iev+1
           vhbt(i,J) = find_vhbt(vbt(i,J),BTCL_v(i,J)) + vhbt0(i,J)
         enddo ; enddo
+!$OMP do
         do j=jsv-1,jev+1 ; do i=isv-1,iev+1
           eta_pred(i,j) = (eta(i,j) + eta_src(i,j)) + (dtbt * CS%IareaT(i,j)) * &
                      ((uhbt(I-1,j) - uhbt(I,j)) + (vhbt(i,J-1) - vhbt(i,J)))
         enddo ; enddo
       else
-!$OMP parallel do default(none) shared(isv,iev,jsv,jev,eta_pred,eta,eta_src,dtbt,&
-!$OMP                                  CS,Datu,ubt,uhbt0,Datv,vhbt0,vbt)
+!$OMP do
         do j=jsv-1,jev+1 ; do i=isv-1,iev+1
           eta_pred(i,j) = (eta(i,j) + eta_src(i,j)) + (dtbt * CS%IareaT(i,j)) * &
               (((Datu(I-1,j)*ubt(I-1,j) + uhbt0(I-1,j)) - &
@@ -1454,35 +1580,43 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
         enddo ; enddo
       endif
 
-      if (CS%dynamic_psurf) then ; do j=jsv-1,jev+1 ; do i=isv-1,iev+1
-        p_surf_dyn(i,j) = dyn_coef_eta(I,j) * (eta_pred(i,j) - eta(i,j))
-      enddo ; enddo ; endif
+      if (CS%dynamic_psurf) then 
+!$OMP do
+        do j=jsv-1,jev+1 ; do i=isv-1,iev+1
+          p_surf_dyn(i,j) = dyn_coef_eta(I,j) * (eta_pred(i,j) - eta(i,j))
+        enddo ; enddo 
+      endif
     endif
 
     ! Recall that just outside the do n loop, there is code like...
     !  eta_PF_BT => eta_pred ; if (project_velocity) eta_PF_BT => eta
 
-    if (find_etaav) then ; do j=js,je ; do i=is,ie
-      eta_sum(i,j) = eta_sum(i,j) + wt_accel2(n) * eta_PF_BT(i,j)
-    enddo ; enddo ; endif
+    if (find_etaav) then 
+!$OMP do
+      do j=js,je ; do i=is,ie
+        eta_sum(i,j) = eta_sum(i,j) + wt_accel2(n) * eta_PF_BT(i,j)
+      enddo ; enddo
+    endif
 
     if (interp_eta_PF) then
       wt_end = n*Instep  ! This could be (n-0.5)*Instep.
+!$OMP do
       do j=jsv-1,jev+1 ; do i=isv-1,iev+1
         eta_PF(i,j) = eta_PF_1(i,j) + wt_end*d_eta_PF(i,j)
       enddo ; enddo
     endif
 
     if (apply_OBC_flather) then
-!$OMP parallel do default(none) shared(isv,iev,jsv,jev,ubt_old,ubt) 
+!$OMP do
       do j=jsv,jev ; do I=isv-2,iev+1
         ubt_old(I,j) = ubt(I,j)
       enddo; enddo
-!$OMP parallel do default(none) shared(isv,iev,jsv,jev,vbt_old,vbt)
+!$OMP do
       do J=jsv-2,jev+1 ; do i=isv,iev
         vbt_old(i,J) = vbt(i,J)
       enddo ; enddo
     endif
+!$OMP end parallel
 
 !$OMP parallel default(none) shared(isv,iev,jsv,jev,G,amer,ubt,cmer,bmer,dmer,eta_PF_BT, &
 !$OMP                               eta_PF,gtot_N,gtot_S,dgeo_de,CS,p_surf_dyn,n,        &
@@ -1494,147 +1628,349 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, &
 !$OMP                               azon,bzon,czon,dzon,Cor_ref_u,gtot_E,gtot_W,         &
 !$OMP                               u_accel_bt,PFu_bt_sum,Coru_bt_sum,apply_u_OBCs,      &
 !$OMP                               bt_rem_u,BT_force_u,uhbt,BTCL_u,uhbt0,Datu,ubt_sum,  &
-!$OMP                               uhbt_sum,ubt_wtd)                                    &
-!$OMP                       private(Cor, gradP, vel_prev, vel_trans )
+!$OMP                               uhbt_sum,ubt_wtd,Cor_2D,gradP_2D,vbt_prev,vhbt_prev, &
+!$OMP                               vbt_sum_prev,vhbt_sum_prev,vbt_wtd_prev,vel_trans_2D,&
+!$OMP                               ubt_prev,uhbt_prev,ubt_sum_prev,uhbt_sum_prev,       &
+!$OMP                               ubt_wtd_prev )                                       &
+!$OMP                       private(vel_prev)
     if (MOD(n+G%first_direction,2)==1) then
       ! On odd-steps, update v first.
 !$OMP do
       do J=jsv-1,jev ; do i=isv-1,iev+1
-        Cor = -1.0*((amer(I-1,j) * ubt(I-1,j) + cmer(I,j+1) * ubt(I,j+1)) + &
+        Cor_2D(i,j) = -1.0*((amer(I-1,j) * ubt(I-1,j) + cmer(I,j+1) * ubt(I,j+1)) + &
                (bmer(I,j) * ubt(I,j) + dmer(I-1,j+1) * ubt(I-1,j+1))) - Cor_ref_v(i,J)
-        gradP = ((eta_PF_BT(i,j)-eta_PF(i,j))*gtot_N(i,j) - &
-                 (eta_PF_BT(i,j+1)-eta_PF(i,j+1))*gtot_S(i,j+1)) * &
-                dgeo_de * CS%IdyCv(i,J)
-        if (CS%dynamic_psurf) &
-          gradP = gradP + (p_surf_dyn(i,j) - p_surf_dyn(i,j+1)) * CS%IdyCv(i,J)
-        v_accel_bt(i,J) = v_accel_bt(i,J) + wt_accel(n) * (Cor + gradP)
-        if (find_PF)  PFv_bt_sum(i,J)  = PFv_bt_sum(i,J) + wt_accel2(n) * gradP
-        if (find_Cor) Corv_bt_sum(i,J) = Corv_bt_sum(i,J) + wt_accel2(n) * Cor
+        gradP_2D(i,j) = ((eta_PF_BT(i,j)-eta_PF(i,j))*gtot_N(i,j) - &
+               (eta_PF_BT(i,j+1)-eta_PF(i,j+1))*gtot_S(i,j+1)) * &
+               dgeo_de * CS%IdyCv(i,J)
+      enddo; enddo
+      if(CS%dynamic_psurf) then
+!$OMP do
+        do J=jsv-1,jev ; do i=isv-1,iev+1
+          gradP_2D(i,j) = gradP_2D(i,j) + (p_surf_dyn(i,j) - p_surf_dyn(i,j+1)) * CS%IdyCv(i,J)
+        enddo; enddo
+      endif
+!$OMP do
+      do J=jsv-1,jev ; do i=isv-1,iev+1
+        v_accel_bt(i,J) = v_accel_bt(i,J) + wt_accel(n) * (Cor_2D(i,j) + gradP_2D(i,j))
+      enddo; enddo
 
-        if (apply_v_OBCs) then ; if (BT_OBC%OBC_mask_v(i,J)) cycle ; endif
+      if (find_PF) then
+!$OMP do
+        do J=jsv-1,jev ; do i=isv-1,iev+1
+          PFv_bt_sum(i,J)  = PFv_bt_sum(i,J) + wt_accel2(n) * gradP_2D(i,j)
+        enddo; enddo
+      endif
+      if (find_Cor) then
+!$OMP do
+        do J=jsv-1,jev ; do i=isv-1,iev+1
+          Corv_bt_sum(i,J) = Corv_bt_sum(i,J) + wt_accel2(n) * Cor_2D(i,j)
+        enddo; enddo
+      endif
+      
+      if(apply_v_OBCs) then  ! save the old value of vbt and vhbt
+!$OMP do
+        do J=jsv-1,jev ; do i=isv-1,iev+1
+          vbt_prev(i,J) = vbt(i,J); vhbt_prev(i,J) = vhbt(i,J)
+          vbt_sum_prev(i,J)=vbt_sum(i,J); vhbt_sum_prev(i,J)=vhbt_sum(i,J) ; vbt_wtd_prev(i,J) = vbt_wtd_prev(i,J)
+        enddo ; enddo
+      endif
 
-        vel_prev = vbt(i,J)
-        vbt(i,J) = bt_rem_v(i,J) * (vbt(i,J) + &
-                        dtbt * ((BT_force_v(i,J) + Cor) + gradP))
-        if (project_velocity) then
-          vel_trans = (1.0 + be_proj)*vbt(i,J) - be_proj*vel_prev
-        else
-          vel_trans = (1.0-bebt)*vel_prev + bebt*vbt(i,J)
-        endif
-        if (use_BT_cont) then
-          vhbt(i,J) = find_vhbt(vel_trans,BTCL_v(i,J)) + vhbt0(i,J)
-        else
-          vhbt(i,J) = Datv(i,J)*vel_trans + vhbt0(i,J)
-        endif
+      if (project_velocity) then
+!$OMP do
+        do J=jsv-1,jev ; do i=isv-1,iev+1  
+          vel_prev = vbt(i,J)
+          vbt(i,J) = bt_rem_v(i,J) * (vbt(i,J) + &
+                      dtbt * ((BT_force_v(i,J) + Cor_2D(i,j)) + gradP_2D(i,j)))
+          vel_trans_2D(i,j) = (1.0 + be_proj)*vbt(i,J) - be_proj*vel_prev
+        enddo; enddo
+      else
+!$OMP do
+        do J=jsv-1,jev ; do i=isv-1,iev+1  
+          vel_prev = vbt(i,J)
+          vbt(i,J) = bt_rem_v(i,J) * (vbt(i,J) + &
+                      dtbt * ((BT_force_v(i,J) + Cor_2D(i,j)) + gradP_2D(i,j)))
+          vel_trans_2D(i,j) = (1.0-bebt)*vel_prev + bebt*vbt(i,J)
+        enddo; enddo
+      endif
 
-        vbt_sum(i,J) = vbt_sum(i,J) + wt_trans(n) * vel_trans
+      if (use_BT_cont) then
+!$OMP do
+         do J=jsv-1,jev ; do i=isv-1,iev+1  
+            vhbt(i,J) = find_vhbt(vel_trans_2D(i,j),BTCL_v(i,J)) + vhbt0(i,J)
+         enddo; enddo
+      else
+!$OMP do
+         do J=jsv-1,jev ; do i=isv-1,iev+1  
+            vhbt(i,J) = Datv(i,J)*vel_trans_2D(i,j) + vhbt0(i,J)
+         enddo; enddo
+      endif
+!$OMP do
+      do J=jsv-1,jev ; do i=isv-1,iev+1
+        vbt_sum(i,J) = vbt_sum(i,J) + wt_trans(n) * vel_trans_2D(i,j)
         vhbt_sum(i,J) = vhbt_sum(i,J) + wt_trans(n) * vhbt(i,J)
         vbt_wtd(i,J) = vbt_wtd(i,J) + wt_vel(n) * vbt(i,J)
-      enddo ; enddo
+      enddo; enddo
+      if(apply_v_OBCs) then  ! copy back the value for the points that OBC_mask_v is true.
+!$OMP do
+        do J=jsv-1,jev ; do i=isv-1,iev+1
+          if (BT_OBC%OBC_mask_v(i,J)) then          
+            vbt(i,J) = vbt_prev(i,J); vhbt(i,J) = vhbt_prev(i,J)
+            vbt_sum(i,J)=vbt_sum_prev(i,J); vhbt_sum(i,J)=vhbt_sum_prev(i,J) ; vbt_wtd(i,J)=vbt_wtd_prev(i,J)
+          endif
+        enddo ; enddo
+      endif      
 
 !$OMP do
       do j=jsv,jev ; do I=isv-1,iev
-        Cor = ((azon(I,j) * vbt(i+1,J) + czon(I,j) * vbt(i,J-1)) + &
+        Cor_2D(i,j) = ((azon(I,j) * vbt(i+1,J) + czon(I,j) * vbt(i,J-1)) + &
                (bzon(I,j) * vbt(i,J) + dzon(I,j) * vbt(i+1,J-1))) - Cor_ref_u(I,j)
-        gradP = ((eta_PF_BT(i,j)-eta_PF(i,j))*gtot_E(i,j) - &
+        gradP_2D(i,j) = ((eta_PF_BT(i,j)-eta_PF(i,j))*gtot_E(i,j) - &
                  (eta_PF_BT(i+1,j)-eta_PF(i+1,j))*gtot_W(i+1,j)) * &
                 dgeo_de * CS%IdxCu(I,j)
-        if (CS%dynamic_psurf) &
-          gradP = gradP + (p_surf_dyn(i,j) - p_surf_dyn(i+1,j)) * CS%IdxCu(I,j)
-        u_accel_bt(I,j) = u_accel_bt(I,j) + wt_accel(n) * (Cor + gradP)
-        if (find_PF)  PFu_bt_sum(I,j)  = PFu_bt_sum(I,j) + wt_accel2(n) * gradP
-        if (find_Cor) Coru_bt_sum(I,j) = Coru_bt_sum(I,j) + wt_accel2(n) * Cor
-
-        if (apply_u_OBCs) then ; if (BT_OBC%OBC_mask_u(I,j)) cycle ; endif
+      enddo; enddo
+      if (CS%dynamic_psurf) then
+!$OMP do
+        do j=jsv,jev ; do I=isv-1,iev
+          gradP_2D(i,j) = gradP_2D(i,j) + (p_surf_dyn(i,j) - p_surf_dyn(i+1,j)) * CS%IdxCu(I,j)
+        enddo; enddo
+      endif     
+!$OMP do
+      do j=jsv,jev ; do I=isv-1,iev
+        u_accel_bt(I,j) = u_accel_bt(I,j) + wt_accel(n) * (Cor_2D(i,j) + gradP_2D(i,j))
+      enddo; enddo
 
-        vel_prev = ubt(I,j)
-        ubt(I,j) = bt_rem_u(I,j) * (ubt(I,j) + &
-                        dtbt * ((BT_force_u(I,j) + Cor) + gradP))
-        if (project_velocity) then
-          vel_trans = (1.0 + be_proj)*ubt(I,j) - be_proj*vel_prev
-        else
-          vel_trans = (1.0-bebt)*vel_prev + bebt*ubt(I,j)
-        endif
-        if (use_BT_cont) then
-          uhbt(I,j) = find_uhbt(vel_trans, BTCL_u(I,j)) + uhbt0(I,j)
-        else
-          uhbt(I,j) = Datu(I,j)*vel_trans + uhbt0(I,j)
-        endif
+      if (find_PF) then
+!$OMP do
+        do j=jsv,jev ; do I=isv-1,iev
+           PFu_bt_sum(I,j)  = PFu_bt_sum(I,j) + wt_accel2(n) * gradP_2D(i,j)
+        enddo; enddo
+      endif
+      if (find_Cor) then
+!$OMP do
+        do j=jsv,jev ; do I=isv-1,iev
+          Coru_bt_sum(I,j) = Coru_bt_sum(I,j) + wt_accel2(n) * Cor_2D(i,j)
+        enddo; enddo
+      endif
 
-        ubt_sum(I,j) = ubt_sum(I,j) + wt_trans(n) * vel_trans
+      if(apply_u_OBCs) then  ! save the old value of vbt and vhbt
+!$OMP do
+        do J=jsv,jev ; do i=isv-1,iev
+          ubt_prev(i,J) = ubt(i,J); uhbt_prev(i,J) = uhbt(i,J)
+          ubt_sum_prev(i,J)=ubt_sum(i,J); uhbt_sum_prev(i,J)=uhbt_sum(i,J) ; ubt_wtd_prev(i,J)=ubt_wtd_prev(i,J)
+        enddo ; enddo
+      endif
+      if (project_velocity) then
+!$OMP do
+        do j=jsv,jev ; do I=isv-1,iev
+          vel_prev = ubt(I,j)
+          ubt(I,j) = bt_rem_u(I,j) * (ubt(I,j) + &
+               dtbt * ((BT_force_u(I,j) + Cor_2D(i,j)) + gradP_2D(i,j)))
+          vel_trans_2D(i,j) = (1.0 + be_proj)*ubt(I,j) - be_proj*vel_prev
+        enddo; enddo
+      else
+!$OMP do
+        do j=jsv,jev ; do I=isv-1,iev
+          vel_prev = ubt(I,j)
+          ubt(I,j) = bt_rem_u(I,j) * (ubt(I,j) + &
+               dtbt * ((BT_force_u(I,j) + Cor_2D(i,j)) + gradP_2D(i,j)))
+          vel_trans_2D(i,j) = (1.0-bebt)*vel_prev + bebt*ubt(I,j)
+        enddo; enddo
+      endif
+      if (use_BT_cont) then
+!$OMP do
+        do j=jsv,jev ; do I=isv-1,iev
+          uhbt(I,j) = find_uhbt(vel_trans_2D(i,j), BTCL_u(I,j)) + uhbt0(I,j)
+        enddo; enddo
+      else
+!$OMP do
+        do j=jsv,jev ; do I=isv-1,iev
+          uhbt(I,j) = Datu(I,j)*vel_trans_2D(i,j) + uhbt0(I,j)
+        enddo; enddo
+      endif
+!$OMP do
+      do j=jsv,jev ; do I=isv-1,iev
+        ubt_sum(I,j) = ubt_sum(I,j) + wt_trans(n) * vel_trans_2D(i,j)
         uhbt_sum(I,j) = uhbt_sum(I,j) + wt_trans(n) * uhbt(I,j)
         ubt_wtd(I,j) = ubt_wtd(I,j) + wt_vel(n) * ubt(I,j)
-      enddo ; enddo
+      enddo; enddo
 
+      if(apply_u_OBCs) then  ! copy back the value for the points that OBC_mask_v is true.
+!$OMP do
+        do J=jsv,jev ; do i=isv-1,iev
+          if (BT_OBC%OBC_mask_u(i,J)) then          
+            ubt(i,J) = ubt_prev(i,J); uhbt(i,J) = uhbt_prev(i,J)
+            ubt_sum(i,J)=ubt_sum_prev(i,J); uhbt_sum(i,J)=uhbt_sum_prev(i,J) ; ubt_wtd(i,J)=ubt_wtd_prev(i,J)
+          endif
+        enddo ; enddo
+      endif      
     else
       ! On even steps, update u first.
 !$OMP do
       do j=jsv-1,jev+1 ; do I=isv-1,iev
-        Cor = ((azon(I,j) * vbt(i+1,J) + czon(I,j) * vbt(i,J-1)) + &
+        Cor_2D(i,j) = ((azon(I,j) * vbt(i+1,J) + czon(I,j) * vbt(i,J-1)) + &
                (bzon(I,j) * vbt(i,J) +  dzon(I,j) * vbt(i+1,J-1))) - Cor_ref_u(I,j)
-        gradP = ((eta_PF_BT(i,j)-eta_PF(i,j))*gtot_E(i,j) - &
+        gradP_2D(i,j) = ((eta_PF_BT(i,j)-eta_PF(i,j))*gtot_E(i,j) - &
                  (eta_PF_BT(i+1,j)-eta_PF(i+1,j))*gtot_W(i+1,j)) * &
-                dgeo_de * CS%IdxCu(I,j)
-        if (CS%dynamic_psurf) &
-          gradP = gradP + (p_surf_dyn(i,j) - p_surf_dyn(i+1,j)) * CS%IdxCu(I,j)
-        u_accel_bt(I,j) = u_accel_bt(I,j) + wt_accel(n) * (Cor + gradP)
-        if (find_PF)  PFu_bt_sum(I,j)  = PFu_bt_sum(I,j) + wt_accel2(n) * gradP
-        if (find_Cor) Coru_bt_sum(I,j) = Coru_bt_sum(I,j) + wt_accel2(n) * Cor
+                 dgeo_de * CS%IdxCu(I,j)
+      enddo; enddo
 
-        if (apply_u_OBCs) then ; if (BT_OBC%OBC_mask_u(I,j)) cycle ; endif
+      if (CS%dynamic_psurf) then
+!$OMP do
+        do j=jsv-1,jev+1 ; do I=isv-1,iev
+          gradP_2D(i,j) = gradP_2D(i,j) + (p_surf_dyn(i,j) - p_surf_dyn(i+1,j)) * CS%IdxCu(I,j)
+        enddo; enddo
+      endif
+!$OMP do
+      do j=jsv-1,jev+1 ; do I=isv-1,iev
+        u_accel_bt(I,j) = u_accel_bt(I,j) + wt_accel(n) * (Cor_2D(i,j) + gradP_2D(i,j))
+      enddo; enddo
 
-        vel_prev = ubt(I,j)
-        ubt(I,j) = bt_rem_u(I,j) * (ubt(I,j) + &
-                        dtbt * ((BT_force_u(I,j) + Cor) + gradP))
-        if (project_velocity) then
-          vel_trans = (1.0 + be_proj)*ubt(I,j) - be_proj*vel_prev
-        else
-          vel_trans = (1.0-bebt)*vel_prev + bebt*ubt(I,j)
-        endif
-        if (use_BT_cont) then
-          uhbt(I,j) = find_uhbt(vel_trans,BTCL_u(I,j)) + uhbt0(I,j)
-        else
-          uhbt(I,j) = Datu(I,j)*vel_trans + uhbt0(I,j)
-        endif
+      if (find_PF) then
+!$OMP do
+        do j=jsv-1,jev+1 ; do I=isv-1,iev
+          PFu_bt_sum(I,j)  = PFu_bt_sum(I,j) + wt_accel2(n) * gradP_2D(i,j)
+        enddo; enddo
+      endif
+      if (find_Cor) then
+!$OMP do
+        do j=jsv-1,jev+1 ; do I=isv-1,iev
+          Coru_bt_sum(I,j) = Coru_bt_sum(I,j) + wt_accel2(n) * Cor_2D(i,j)
+        enddo; enddo
+      endif
+      if(apply_u_OBCs) then  ! save the old value of vbt and vhbt
+!$OMP do
+        do J=jsv-1,jev+1 ; do i=isv-1,iev
+          ubt_prev(i,J) = ubt(i,J); uhbt_prev(i,J) = uhbt(i,J)
+          ubt_sum_prev(i,J)=ubt_sum(i,J); uhbt_sum_prev(i,J)=uhbt_sum(i,J) ; ubt_wtd_prev(i,J)=ubt_wtd_prev(i,J)
+        enddo ; enddo
+      endif
 
-        ubt_sum(I,j) = ubt_sum(I,j) + wt_trans(n) * vel_trans
+      if (project_velocity) then
+!$OMP do
+        do j=jsv-1,jev+1 ; do I=isv-1,iev
+          vel_prev = ubt(I,j)
+          ubt(I,j) = bt_rem_u(I,j) * (ubt(I,j) + &
+               dtbt * ((BT_force_u(I,j) + Cor_2D(i,j)) + gradP_2D(i,j)))
+          vel_trans_2D(i,j) = (1.0 + be_proj)*ubt(I,j) - be_proj*vel_prev
+        enddo; enddo
+      else
+!$OMP do
+       do j=jsv-1,jev+1 ; do I=isv-1,iev
+          vel_prev = ubt(I,j)
+          ubt(I,j) = bt_rem_u(I,j) * (ubt(I,j) + &
+               dtbt * ((BT_force_u(I,j) + Cor_2D(i,j)) + gradP_2D(i,j)))
+          vel_trans_2D(i,j) = (1.0-bebt)*vel_prev + bebt*ubt(I,j)
+        enddo; enddo
+      endif
+      if (use_BT_cont) then
+!$OMP do
+        do j=jsv-1,jev+1 ; do I=isv-1,iev
+          uhbt(I,j) = find_uhbt(vel_trans_2D(i,j),BTCL_u(I,j)) + uhbt0(I,j)
+        enddo; enddo
+      else
+!$OMP do
+        do j=jsv-1,jev+1 ; do I=isv-1,iev
+          uhbt(I,j) = Datu(I,j)*vel_trans_2D(i,j) + uhbt0(I,j)
+        enddo; enddo
+      endif
+!$OMP do
+      do j=jsv-1,jev+1 ; do I=isv-1,iev
+        ubt_sum(I,j) = ubt_sum(I,j) + wt_trans(n) * vel_trans_2D(i,j)
         uhbt_sum(I,j) = uhbt_sum(I,j) + wt_trans(n) * uhbt(I,j)
         ubt_wtd(I,j) = ubt_wtd(I,j) + wt_vel(n) * ubt(I,j)
-      enddo ; enddo
+      enddo; enddo
+      if(apply_u_OBCs) then  ! copy back the value for the points that OBC_mask_v is true.
+!$OMP do
+        do J=jsv-1,jev+1 ; do i=isv-1,iev
+          if (BT_OBC%OBC_mask_u(i,J)) then          
+            ubt(i,J) = ubt_prev(i,J); uhbt(i,J) = uhbt_prev(i,J)
+            ubt_sum(i,J)=ubt_sum_prev(i,J); uhbt_sum(i,J)=uhbt_sum_prev(i,J) ; ubt_wtd(i,J)=ubt_wtd_prev(i,J)
+          endif
+        enddo ; enddo
+      endif      
 
 !$OMP do
       do J=jsv-1,jev ; do i=isv,iev
-        Cor = -1.0*((amer(I-1,j) * ubt(I-1,j) + bmer(I,j) * ubt(I,j)) + &
+        Cor_2D(i,j) = -1.0*((amer(I-1,j) * ubt(I-1,j) + bmer(I,j) * ubt(I,j)) + &
                 (cmer(I,j+1) * ubt(I,j+1) + dmer(I-1,j+1) * ubt(I-1,j+1))) - Cor_ref_v(i,J)
-        gradP = ((eta_PF_BT(i,j)-eta_PF(i,j))*gtot_N(i,j) - &
+        gradP_2D(i,j) = ((eta_PF_BT(i,j)-eta_PF(i,j))*gtot_N(i,j) - &
                  (eta_PF_BT(i,j+1)-eta_PF(i,j+1))*gtot_S(i,j+1)) * &
-                dgeo_de * CS%IdyCv(i,J)
-        if (CS%dynamic_psurf) &
-          gradP = gradP + (p_surf_dyn(i,j) - p_surf_dyn(i,j+1)) * CS%IdyCv(i,J)
-        v_accel_bt(I,j) = v_accel_bt(I,j) + wt_accel(n) * (Cor + gradP)
-        if (find_PF)  PFv_bt_sum(i,J)  = PFv_bt_sum(i,J) + wt_accel2(n) * gradP
-        if (find_Cor) Corv_bt_sum(i,J) = Corv_bt_sum(i,J) + wt_accel2(n) * Cor
-
-        if (apply_v_OBCs) then ; if (BT_OBC%OBC_mask_v(i,J)) cycle ; endif
-
-        vel_prev = vbt(i,J)
-        vbt(i,J) = bt_rem_v(i,J) * (vbt(i,J) + &
-                        dtbt * ((BT_force_v(i,J) + Cor) + gradP))
-        if (project_velocity) then
-          vel_trans = (1.0 + be_proj)*vbt(i,J) - be_proj*vel_prev
-        else
-          vel_trans = (1.0-bebt)*vel_prev + bebt*vbt(i,J)
-        endif
-         if (use_BT_cont) then
-          vhbt(i,J) = find_vhbt(vel_trans,BTCL_v(i,J)) + vhbt0(i,J)
-        else
-          vhbt(i,J) = Datv(i,J)*vel_trans + vhbt0(i,J)
-        endif
+                 dgeo_de * CS%IdyCv(i,J)
+      enddo; enddo
 
-        vbt_sum(i,J) = vbt_sum(i,J) + wt_trans(n) * vel_trans
+      if (CS%dynamic_psurf) then
+!$OMP do
+        do J=jsv-1,jev ; do i=isv,iev
+          gradP_2D(i,j) = gradP_2D(i,j) + (p_surf_dyn(i,j) - p_surf_dyn(i,j+1)) * CS%IdyCv(i,J)
+        enddo; enddo
+      endif
+!$OMP do
+      do J=jsv-1,jev ; do i=isv,iev
+        v_accel_bt(I,j) = v_accel_bt(I,j) + wt_accel(n) * (Cor_2D(i,j) + gradP_2D(i,j))
+      enddo; enddo
+      if (find_PF) then
+!$OMP do
+        do J=jsv-1,jev ; do i=isv,iev
+          PFv_bt_sum(i,J)  = PFv_bt_sum(i,J) + wt_accel2(n) * gradP_2D(i,j)
+        enddo; enddo
+      endif
+      if (find_Cor) then
+!$OMP do
+        do J=jsv-1,jev ; do i=isv,iev
+          Corv_bt_sum(i,J) = Corv_bt_sum(i,J) + wt_accel2(n) * Cor_2D(i,j)
+        enddo; enddo
+      endif
+      if(apply_v_OBCs) then  ! save the old value of vbt and vhbt
+!$OMP do
+        do J=jsv-1,jev ; do i=isv,iev
+          vbt_prev(i,J) = vbt(i,J); vhbt_prev(i,J) = vhbt(i,J)
+          vbt_sum_prev(i,J)=vbt_sum(i,J); vhbt_sum_prev(i,J)=vhbt_sum(i,J) ; vbt_wtd_prev(i,J)=vbt_wtd_prev(i,J)
+        enddo ; enddo
+      endif
+      if (project_velocity) then
+!$OMP do
+         do J=jsv-1,jev ; do i=isv,iev
+            vel_prev = vbt(i,J)
+            vbt(i,J) = bt_rem_v(i,J) * (vbt(i,J) + &
+                 dtbt * ((BT_force_v(i,J) + Cor_2D(i,j)) + gradP_2D(i,j)))
+            vel_trans_2D(i,j) = (1.0 + be_proj)*vbt(i,J) - be_proj*vel_prev
+         enddo; enddo
+      else
+!$OMP do
+         do J=jsv-1,jev ; do i=isv,iev
+            vel_prev = vbt(i,J)
+            vbt(i,J) = bt_rem_v(i,J) * (vbt(i,J) + &
+                 dtbt * ((BT_force_v(i,J) + Cor_2D(i,j)) + gradP_2D(i,j)))
+            vel_trans_2D(i,j) = (1.0-bebt)*vel_prev + bebt*vbt(i,J)
+         enddo; enddo
+      endif
+      if (use_BT_cont) then
+!$OMP do
+        do J=jsv-1,jev ; do i=isv,iev
+          vhbt(i,J) = find_vhbt(vel_trans_2D(i,j),BTCL_v(i,J)) + vhbt0(i,J)
+        enddo; enddo
+      else
+!$OMP do
+        do J=jsv-1,jev ; do i=isv,iev
+          vhbt(i,J) = Datv(i,J)*vel_trans_2D(i,j) + vhbt0(i,J)
+        enddo; enddo
+      endif
+!$OMP do
+      do J=jsv-1,jev ; do i=isv,iev
+        vbt_sum(i,J) = vbt_sum(i,J) + wt_trans(n) * vel_trans_2D(i,j)
         vhbt_sum(i,J) = vhbt_sum(i,J) + wt_trans(n) * vhbt(i,J)
         vbt_wtd(i,J) = vbt_wtd(i,J) + wt_vel(n) * vbt(i,J)
-      enddo ; enddo
+      enddo; enddo
+
+      if(apply_v_OBCs) then  ! copy back the value for the points that OBC_mask_v is true.
+!$OMP do
+        do J=jsv-1,jev ; do i=isv,iev
+          if (BT_OBC%OBC_mask_v(i,J)) then          
+            vbt(i,J) = vbt_prev(i,J); vhbt(i,J) = vhbt_prev(i,J)
+            vbt_sum(i,J)=vbt_sum_prev(i,J); vhbt_sum(i,J)=vhbt_sum_prev(i,J) ; vbt_wtd(i,J)=vbt_wtd_prev(i,J)
+          endif
+        enddo ; enddo
+      endif      
     endif
 !$OMP end parallel
 
@@ -2486,6 +2822,7 @@ subroutine btcalc(h, G, CS, h_u, h_v, may_use_default)
   ! points, using a harmonic mean estimate.
 !$OMP parallel do default(none) shared(is,ie,js,je,nz,h_u,CS,h_neglect,h,use_default,G) &
 !$OMP                          private(hatutot,Ihatutot,e_u,D_shallow_u,h_arith,h_harm,wt_arith)
+
   do j=js-1,je+1
     if (present(h_u)) then
       do I=is-2,ie+1 ; hatutot(I) = h_u(i,j,1) ; enddo
@@ -2548,7 +2885,7 @@ subroutine btcalc(h, G, CS, h_u, h_v, may_use_default)
   enddo
 
 !$OMP parallel do default(none) shared(is,ie,js,je,nz,CS,G,h_v,h_neglect,h,use_default) &
-!$OMP                          private(hatvtot,Ihatvtot,e_v,D_shallow_v,h_arith,h_harm,wt_arith)
+!$OMP                          private(hatvtot,Ihatvtot,e_v,D_shallow_v,h_arith,h_harm,wt_arith) 
   do J=js-2,je+1
     if (present(h_v)) then
       do i=is-1,ie+1 ; hatvtot(i) = h_v(i,j,1) ; enddo
@@ -2881,23 +3218,34 @@ subroutine set_local_BT_cont_types(BT_cont, BTCL_u, BTCL_v, G, MS, BT_Domain, ha
   hs = 1 ; if (present(halo)) hs = max(halo,0)
 
   ! Copy the BT_cont arrays into symmetric, potentially wide haloed arrays.
-  u_polarity(:,:) = 1.0
-  uBT_EE(:,:) = 0.0 ; uBT_WW(:,:) = 0.0 
-  FA_u_EE(:,:) = 0.0 ; FA_u_E0(:,:) = 0.0 ; FA_u_W0(:,:) = 0.0 ; FA_u_WW(:,:) = 0.0
-  do I=is-1,ie ; do j=js,je
+!$OMP parallel default(none) shared(is,ie,js,je,hs,u_polarity,uBT_EE,uBT_WW,FA_u_EE, &
+!$OMP                               FA_u_E0,FA_u_W0,FA_u_WW,v_polarity,vBT_NN,vBT_SS,&
+!$OMP                               FA_v_NN,FA_v_N0,FA_v_S0,FA_v_SS,BT_cont )
+!$OMP do
+  do j=js-hs,je+hs ; do i=is-hs-1,ie+hs
+    u_polarity(i,j) = 1.0
+    uBT_EE(i,j) = 0.0 ; uBT_WW(i,j) = 0.0 
+    FA_u_EE(i,j) = 0.0 ; FA_u_E0(i,j) = 0.0 ; FA_u_W0(i,j) = 0.0 ; FA_u_WW(i,j) = 0.0
+  enddo; enddo
+!$OMP do
+  do j=js-hs-1,je+hs ; do i=is-hs,ie+hs
+    v_polarity(i,j) = 1.0
+    vBT_NN(i,j) = 0.0 ; vBT_SS(i,j) = 0.0
+    FA_v_NN(i,j) = 0.0 ; FA_v_N0(i,j) = 0.0 ; FA_v_S0(i,j) = 0.0 ; FA_v_SS(i,j) = 0.0
+  enddo; enddo
+!$OMP do
+  do j=js,je; do I=is-1,ie
     uBT_EE(I,j) = BT_cont%uBT_EE(I,j) ; uBT_WW(I,j) = BT_cont%uBT_WW(I,j)
     FA_u_EE(I,j) = BT_cont%FA_u_EE(I,j) ; FA_u_E0(I,j) = BT_cont%FA_u_E0(I,j)
     FA_u_W0(I,j) = BT_cont%FA_u_W0(I,j) ; FA_u_WW(I,j) = BT_cont%FA_u_WW(I,j)
   enddo ; enddo
-
-  v_polarity(:,:) = 1.0
-  vBT_NN(:,:) = 0.0 ; vBT_SS(:,:) = 0.0 
-  FA_v_NN(:,:) = 0.0 ; FA_v_N0(:,:) = 0.0 ; FA_v_S0(:,:) = 0.0 ; FA_v_SS(:,:) = 0.0
-  do i=is,ie ; do J=js-1,je
+!$OMP do
+  do J=js-1,je; do i=is,ie
     vBT_NN(i,J) = BT_cont%vBT_NN(i,J) ; vBT_SS(i,J) = BT_cont%vBT_SS(i,J)
     FA_v_NN(i,J) = BT_cont%FA_v_NN(i,J) ; FA_v_N0(i,J) = BT_cont%FA_v_N0(i,J)
     FA_v_S0(i,J) = BT_cont%FA_v_S0(i,J) ; FA_v_SS(i,J) = BT_cont%FA_v_SS(i,J)
   enddo ; enddo
+!$OMP end parallel
 
   if (id_clock_calc_pre > 0) call cpu_clock_end(id_clock_calc_pre)
   if (id_clock_pass_pre > 0) call cpu_clock_begin(id_clock_pass_pre)
@@ -2913,6 +3261,11 @@ subroutine set_local_BT_cont_types(BT_cont, BTCL_u, BTCL_v, G, MS, BT_Domain, ha
   if (id_clock_pass_pre > 0) call cpu_clock_end(id_clock_pass_pre)
   if (id_clock_calc_pre > 0) call cpu_clock_begin(id_clock_calc_pre)
 
+!$OMP parallel default(none) shared(is,ie,js,je,hs,BTCL_u,FA_u_EE,FA_u_E0,FA_u_W0, &
+!$OMP                               FA_u_WW,uBT_EE,uBT_WW,u_polarity,BTCL_v,       &
+!$OMP                               FA_v_NN,FA_v_N0,FA_v_S0,FA_v_SS,vBT_NN,vBT_SS, &
+!$OMP                               v_polarity )
+!$OMP do
   do j=js-hs,je+hs ; do I=is-hs-1,ie+hs
     BTCL_u(I,j)%FA_u_EE = FA_u_EE(I,j) ; BTCL_u(I,j)%FA_u_E0 = FA_u_E0(I,j)
     BTCL_u(I,j)%FA_u_W0 = FA_u_W0(I,j) ; BTCL_u(I,j)%FA_u_WW = FA_u_WW(I,j)
@@ -2935,7 +3288,7 @@ subroutine set_local_BT_cont_types(BT_cont, BTCL_u, BTCL_v, G, MS, BT_Domain, ha
     if (abs(BTCL_u(I,j)%uBT_EE) > 0.0) BTCL_u(I,j)%uh_crvE = &
       (C1_3 * (BTCL_u(I,j)%FA_u_EE - BTCL_u(I,j)%FA_u_E0)) / BTCL_u(I,j)%uBT_EE**2
   enddo ; enddo
-
+!$OMP do
   do J=js-hs-1,je+hs ; do i=is-hs,ie+hs
     BTCL_v(i,J)%FA_v_NN = FA_v_NN(i,J) ; BTCL_v(i,J)%FA_v_N0 = FA_v_N0(i,J)
     BTCL_v(i,J)%FA_v_S0 = FA_v_S0(i,J) ; BTCL_v(i,J)%FA_v_SS = FA_v_SS(i,J)
@@ -2958,7 +3311,7 @@ subroutine set_local_BT_cont_types(BT_cont, BTCL_u, BTCL_v, G, MS, BT_Domain, ha
     if (abs(BTCL_v(i,J)%vBT_NN) > 0.0) BTCL_v(i,J)%vh_crvN = &
       (C1_3 * (BTCL_v(i,J)%FA_v_NN - BTCL_v(i,J)%FA_v_N0)) / BTCL_v(i,J)%vBT_NN**2
   enddo ; enddo
-
+!$OMP end parallel
 end subroutine set_local_BT_cont_types
 
 subroutine BT_cont_to_face_areas(BT_cont, Datu, Datv, G, MS, halo, maximize)
@@ -2994,6 +3347,7 @@ subroutine BT_cont_to_face_areas(BT_cont, Datu, Datv, G, MS, halo, maximize)
       Datv(i,J) = 0.5 * (BT_cont%FA_v_N0(i,J) + BT_cont%FA_v_S0(i,J))
     enddo ; enddo
   endif
+
 end subroutine BT_cont_to_face_areas
 
 subroutine swap(a,b)
diff --git a/src/equation_of_state/MOM_EOS.F90 b/src/equation_of_state/MOM_EOS.F90
index 2b1e2522c5..eb6ebd8a1c 100644
--- a/src/equation_of_state/MOM_EOS.F90
+++ b/src/equation_of_state/MOM_EOS.F90
@@ -972,13 +972,21 @@ subroutine int_density_dz_generic_plm (T_t, T_b, S_t, S_b, z_t, z_b, rho_ref, &
 !  (in,opt) useMassWghtInterp - If true, uses mass weighting to interpolate
 !                       T/S for top and bottom integrals.
 
-  real :: T5(5), S5(5), p5(5), r5(5)
-  real :: T15(15), S15(15), p15(15), r15(15)
+  real :: T5((5*G%iscB+1):(5*(G%iecB+2)))
+  real :: S5((5*G%iscB+1):(5*(G%iecB+2)))
+  real :: p5((5*G%iscB+1):(5*(G%iecB+2)))
+  real :: r5((5*G%iscB+1):(5*(G%iecB+2)))
+  real :: u5((5*G%iscB+1):(5*(G%iecB+2)))
+  real :: T15((15*G%iscB+1):(15*(G%iecB+1)))
+  real :: S15((15*G%iscB+1):(15*(G%iecB+1)))
+  real :: p15((15*G%iscB+1):(15*(G%iecB+1)))
+  real :: r15((15*G%iscB+1):(15*(G%iecB+1)))
+  real :: wt_t(5), wt_b(5)
   real :: rho_anom
   real :: w_left, w_right, intz(5)
   real, parameter :: C1_90 = 1.0/90.0  ! Rational constants.
   real :: GxRho, I_Rho
-  real :: dz, dz_x(5), dz_y(5)
+  real :: dz(G%iscB:G%iecB+1), dz_x(5,G%iscB:G%iecB), dz_y(5,G%isc:G%iec)
   real :: weight_t, weight_b, hWght, massWeightingToggle
   real :: Ttl, Tbl, Ttr, Tbr, Stl, Sbl, Str, Sbr, hL, hR, iDenom
   integer :: Isq, Ieq, Jsq, Jeq, i, j, m, n
@@ -993,38 +1001,47 @@ subroutine int_density_dz_generic_plm (T_t, T_b, S_t, S_b, z_t, z_b, rho_ref, &
     if (useMassWghtInterp) massWeightingToggle = 1.
   endif
 
+  do n = 1, 5 
+     wt_t(n) = 0.25 * real(5-n)
+     wt_b(n) = 1.0 - wt_t(n)
+  enddo
+
   ! =============================
   ! 1. Compute vertical integrals
   ! =============================
-  do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
-    dz = z_t(i,j) - z_b(i,j)
+  do j=Jsq,Jeq+1
+     do i = Isq,Ieq+1
+        dz(i) = z_t(i,j) - z_b(i,j)
     do n=1,5
-      weight_t = 0.25 * real(5-n)
-      weight_b = 1.0 - weight_t
-      
-      p5(n) = -GxRho*(z_t(i,j) - 0.25*real(n-1)*dz)
+           p5(i*5+n) = -GxRho*(z_t(i,j) - 0.25*real(n-1)*dz(i))
       ! Salinity and temperature points are linearly interpolated
-      S5(n) = weight_t * S_t(i,j) + weight_b * S_b(i,j)
-      T5(n) = weight_t * T_t(i,j) + weight_b * T_b(i,j)
+           S5(i*5+n) = wt_t(n) * S_t(i,j) + wt_b(n) * S_b(i,j)
+           T5(i*5+n) = wt_t(n) * T_t(i,j) + wt_b(n) * T_b(i,j)
+      enddo
     enddo
-    call calculate_density(T5, S5, p5, r5, 1, 5, EOS)
+    call calculate_density_array(T5, S5, p5, r5, 1, (ieq-isq+2)*5, EOS )
+    u5 = r5 - rho_ref
     
+    do i = isq, ieq+1
     ! Use Bode's rule to estimate the pressure anomaly change.
-    rho_anom = C1_90*(7.0*(r5(1)+r5(5)) + 32.0*(r5(2)+r5(4)) + 12.0*r5(3)) - &
+       rho_anom = C1_90*(7.0*(r5(i*5+1)+r5(i*5+5)) + 32.0*(r5(i*5+2)+r5(i*5+4)) + 12.0*r5(i*5+3)) - &
            rho_ref
-    dpa(i,j) = G_e*dz*rho_anom
+       dpa(i,j) = G_e*dz(i)*rho_anom
+       if(present(intz_dpa)) then
     ! Use a Bode's-rule-like fifth-order accurate estimate of 
     ! the double integral of the pressure anomaly.
-    r5 = r5 - rho_ref
-    if (present(intz_dpa)) intz_dpa(i,j) = 0.5*G_e*dz**2 * &
-          (rho_anom - C1_90*(16.0*(r5(4)-r5(2)) + 7.0*(r5(5)-r5(1))) )
-  enddo ; enddo ! end loops on j and i
+          intz_dpa(i,j) = 0.5*G_e*dz(i)**2 * &
+                (rho_anom - C1_90*(16.0*(u5(i*5+4)-u5(i*5+2)) + 7.0*(u5(i*5+5)-u5(i*5+1))) )
+       endif
+    enddo
+  enddo ! end loops on j 
+
 
   ! ==================================================
   ! 2. Compute horizontal integrals in the x direction
   ! ==================================================
-  if (present(intx_dpa)) then ; do j=G%jsc,G%jec ; do I=Isq,Ieq
-    intz(1) = dpa(i,j) ; intz(5) = dpa(i+1,j)
+  if (present(intx_dpa)) then ; do j=G%jsc,G%jec 
+     do I=Isq,Ieq
 
     ! Corner values of T and S
     ! hWght is the distance measure by which the cell is violation of
@@ -1054,14 +1071,14 @@ subroutine int_density_dz_generic_plm (T_t, T_b, S_t, S_b, z_t, z_b, rho_ref, &
     endif
 
     do m=2,4
-      w_left = 0.25*real(5-m) ; w_right = 1.0-w_left  ! = 0.25*real(m-1)
-      dz_x(m) = w_left*(z_t(i,j) - z_b(i,j)) + w_right*(z_t(i+1,j) - z_b(i+1,j))
+           w_left = 0.25*real(5-m) ; w_right = 1.0-w_left
+           dz_x(m,i) = w_left*(z_t(i,j) - z_b(i,j)) + w_right*(z_t(i+1,j) - z_b(i+1,j))
 
       ! Salinity and temperature points are linearly interpolated in
       ! the horizontal. The subscript (1) refers to the top value in
       ! the vertical profile while subscript (5) refers to the bottom
       ! value in the vertical profile.
-      pos = (m-2)*5
+           pos = i*15+(m-2)*5
       T15(pos+1) = w_left*Ttl + w_right*Ttr
       T15(pos+5) = w_left*Tbl + w_right*Tbr
 
@@ -1072,37 +1089,41 @@ subroutine int_density_dz_generic_plm (T_t, T_b, S_t, S_b, z_t, z_b, rho_ref, &
 
       ! Pressure
       do n=2,5
-        p15(pos+n) = p15(pos+n-1) + GxRho*0.25*dz_x(m)
+              p15(pos+n) = p15(pos+n-1) + GxRho*0.25*dz_x(m,i)
       enddo
 
       ! Salinity and temperature (linear interpolation in the vertical)
-      do n=1,5
+           do n=2,4
         weight_t = 0.25 * real(5-n)
         weight_b = 1.0 - weight_t
         S15(pos+n) = weight_t * S15(pos+1) + weight_b * S15(pos+5)
         T15(pos+n) = weight_t * T15(pos+1) + weight_b * T15(pos+5)
       enddo
     enddo
+    enddo
 
-    call calculate_density(T15, S15, p15, r15, 1, 15, EOS)
+    call calculate_density(T15, S15, p15, r15, 1, 15*(ieq-isq+1), EOS)
+
+    do I=Isq,Ieq
+       intz(1) = dpa(i,j) ; intz(5) = dpa(i+1,j)    
     
     ! Use Bode's rule to estimate the pressure anomaly change.
     do m = 2,4
-      pos = (m-2)*5
-      intz(m) = G_e*dz_x(m)*( C1_90*(7.0*(r15(pos+1)+r15(pos+5)) + 32.0*(r15(pos+2)+r15(pos+4)) + &
+          pos = i*15+(m-2)*5
+          intz(m) = G_e*dz_x(m,i)*( C1_90*(7.0*(r15(pos+1)+r15(pos+5)) + 32.0*(r15(pos+2)+r15(pos+4)) + &
                             12.0*r15(pos+3)) - rho_ref)
     enddo    
     ! Use Bode's rule to integrate the bottom pressure anomaly values in x.
     intx_dpa(i,j) = C1_90*(7.0*(intz(1)+intz(5)) + 32.0*(intz(2)+intz(4)) + &
                            12.0*intz(3))
-  enddo ; enddo ; endif
+    enddo
+  enddo ; endif
 
   ! ==================================================
   ! 3. Compute horizontal integrals in the y direction
   ! ==================================================
-  if (present(inty_dpa)) then ; do J=Jsq,Jeq ; do i=G%isc,G%iec
-    intz(1) = dpa(i,j) ; intz(5) = dpa(i,j+1)
-
+  if (present(inty_dpa)) then ; do J=Jsq,Jeq 
+     do i=G%isc,G%iec
     ! Corner values of T and S
     ! hWght is the distance measure by which the cell is violation of
     ! hydrostatic consistency. For large hWght we bias the interpolation
@@ -1131,14 +1152,14 @@ subroutine int_density_dz_generic_plm (T_t, T_b, S_t, S_b, z_t, z_b, rho_ref, &
     endif
 
     do m=2,4
-      w_left = 0.25*real(5-m) ; w_right = 1.0-w_left  ! = 0.25*real(m-1)
-      dz_y(m) = w_left*(z_t(i,j) - z_b(i,j)) + w_right*(z_t(i,j+1) - z_b(i,j+1))
+           w_left = 0.25*real(5-m) ; w_right = 1.0-w_left
+           dz_y(m,i) = w_left*(z_t(i,j) - z_b(i,j)) + w_right*(z_t(i,j+1) - z_b(i,j+1))
 
       ! Salinity and temperature points are linearly interpolated in
       ! the horizontal. The subscript (1) refers to the top value in
       ! the vertical profile while subscript (5) refers to the bottom
       ! value in the vertical profile.
-      pos = (m-2)*5
+           pos = i*15+(m-2)*5
       T15(pos+1) = w_left*Ttl + w_right*Ttr
       T15(pos+5) = w_left*Tbl + w_right*Tbr
       
@@ -1149,31 +1170,37 @@ subroutine int_density_dz_generic_plm (T_t, T_b, S_t, S_b, z_t, z_b, rho_ref, &
 
       ! Pressure
       do n=2,5
-        p15(pos+n) = p15(pos+n-1) + GxRho*0.25*dz_y(m)
+              p15(pos+n) = p15(pos+n-1) + GxRho*0.25*dz_y(m,i)
       enddo
 
       ! Salinity and temperature (linear interpolation in the vertical)
-      do n=1,5
+           do n=2,4
         weight_t = 0.25 * real(5-n)
         weight_b = 1.0 - weight_t
         S15(pos+n) = weight_t * S15(pos+1) + weight_b * S15(pos+5)
         T15(pos+n) = weight_t * T15(pos+1) + weight_b * T15(pos+5)
       enddo
     enddo      
+     enddo
 
-    call calculate_density(T15, S15, p15, r15, 1, 15, EOS)
+     call calculate_density_array(T15(15*G%isc+1:), S15(15*G%isc+1:), p15(15*G%isc+1:), &
+                 r15(15*G%isc+1:), 1, 15*(G%iec-G%isc+1), EOS)
+     do i=G%isc,G%iec
+        intz(1) = dpa(i,j) ; intz(5) = dpa(i,j+1)        
     
     ! Use Bode's rule to estimate the pressure anomaly change.
     do m = 2,4
-      pos = (m-2)*5
-      intz(m) = G_e*dz_y(m)*( C1_90*(7.0*(r15(pos+1)+r15(pos+5)) + 32.0*(r15(pos+2)+r15(pos+4)) + &
+           pos = i*15+(m-2)*5
+           intz(m) = G_e*dz_y(m,i)*( C1_90*(7.0*(r15(pos+1)+r15(pos+5)) + 32.0*(r15(pos+2)+r15(pos+4)) + &
                             12.0*r15(pos+3)) - rho_ref)
     enddo
-
     ! Use Bode's rule to integrate the values.
     inty_dpa(i,j) = C1_90*(7.0*(intz(1)+intz(5)) + 32.0*(intz(2)+intz(4)) + &
                            12.0*intz(3))
-  enddo ; enddo ; endif
+     enddo
+  enddo ; endif
+
+
 end subroutine int_density_dz_generic_plm
 ! ==========================================================================
 ! Above is the routine where only the S and T profiles are modified