From 062489f1b0af23fd05b406732f3f58ad6ab07b2f Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Sun, 29 Mar 2020 14:30:44 -0600
Subject: [PATCH 01/13] Add cv_options=5 BE generation capability

modified:   README.gen_be_v3
modified:   gen_be_v3.F90
new file:   util/combine_be_cv5.f90
---
 var/gen_be_v3/README.gen_be_v3        |   10 +-
 var/gen_be_v3/gen_be_v3.F90           | 2873 +++++++++++++++++++++++--
 var/gen_be_v3/util/combine_be_cv5.f90 |  293 +++
 3 files changed, 3033 insertions(+), 143 deletions(-)
 create mode 100644 var/gen_be_v3/util/combine_be_cv5.f90

diff --git a/var/gen_be_v3/README.gen_be_v3 b/var/gen_be_v3/README.gen_be_v3
index 1d35f08654..e552e263f8 100644
--- a/var/gen_be_v3/README.gen_be_v3
+++ b/var/gen_be_v3/README.gen_be_v3
@@ -1,6 +1,5 @@
 
 *** gen_be_v3.F90 is a user-contribute program and is provided as it is. ***
-*** gen_be_v3.F90 is for univariate processing only. ***
 *** gen_be_v3.F90 is for bin_type=5 (domain-average BE statistics) only. ***
 
 1. To compile:
@@ -49,6 +48,7 @@
    be_method = 'NMC'
    varnames = 'T', 'U', 'V', 'RH', 'PSFC' ! for cv_options=7.
                                           ! can also be 'QCLOUD', 'QRAIN', 'QICE', 'QSNOW', 'QGRAUP', 'W'
+   !varnames = 'PSI', 'CHI_U', 'T_U', 'RH', 'PSFC_U' ! for cv_options=5
    do_pert_calc = .true.
    do_slen_calc = .true.
    slen_opt = 2               ! 1: curve-fitting method (extremely slow, need to run with OPENMP on)
@@ -60,6 +60,7 @@
                               ! 2: read from pert1 file when need it
                               !    This option writes out additional vertical-mode-projected fields when do_eof_transform=.true.
                               ! Set pert1_read_opt=2 for large number of cases and large domain sizes that encounter memory insufficiency.
+                              ! Note: when unbalanced variables are requested in varnames, pert1_read_opt=2 is used regardless of the namelist setting.
 /
 &ens_nml
 /
@@ -68,11 +69,10 @@
 
       Each variable is in separate output file.
 
-      For cv_options=7 purpose, util/combine_be_cv7.f90 can be used to combine be_[T/U/V/RH/PSFC].dat into be.dat that WRFDA can read.
-
       For cv_options=7 and cloud_cv_options=2 purpose, util/combine_be_cv7_ccv2.f90 can be used to
           combine be_[T/U/V/RH/PSFC/QCLOUD/QRAIN/QICE/QSNOW/QGRAUP/W].dat into be.dat that WRFDA can read.
 
-      pert1.* files are intermediate output that can be removed after the successful execution.
+      For cv_options=5 purpose, util/combine_be_cv5.f90 can be used to combine be_[PSI/CHI_U/T_U/RH/PSFC_U].dat and regcoeff[1/2/3].dat into be.dat that WRFDA can read.
+
+      pert* [psi.*] files are intermediate output that can be removed after the successful execution.
 
-      pertv_* files are intermediate pert1_read_opt=2 output that can be removed after the successful execution.
diff --git a/var/gen_be_v3/gen_be_v3.F90 b/var/gen_be_v3/gen_be_v3.F90
index 95f0a7b83f..64d5a567ca 100644
--- a/var/gen_be_v3/gen_be_v3.F90
+++ b/var/gen_be_v3/gen_be_v3.F90
@@ -94,14 +94,23 @@ program gen_be_v3
    integer :: nvar, nvar_avail
    integer :: mp_physics
    real    :: ds
+   character(len=VarNameLen) :: vartmp
+   integer :: nvar_all
 
-   logical, allocatable :: read_it(:)
+   logical, allocatable :: do_this_var(:)
    integer, allocatable :: var_dim(:)
 
    logical :: read_t
    logical :: read_qv
    logical :: read_prs
+   logical :: read_psfc
    logical :: read_ght = .false.
+   logical :: read_u
+   logical :: read_v
+   logical :: calc_psi
+   logical :: calc_chi
+   logical :: calc_regcoeff
+   integer :: indx_psi
 
    integer              :: nk_2d = 1
    integer              :: sl_unit = 77
@@ -113,7 +122,7 @@ program gen_be_v3
    integer, allocatable :: bin(:,:,:)
    integer, allocatable :: bin2d(:,:)
 
-   integer :: ii, i, j, n, k, ierr
+   integer :: ii, i, j, n, k, ierr, iv
    integer :: icase, icode
    integer :: istat
    logical :: isfile
@@ -144,9 +153,15 @@ program gen_be_v3
    real(r_double), allocatable :: r8tmp2d2(:,:)
    real(r_double), allocatable :: r8tmp3d(:,:,:)
 
+   ! for slen_opt=2
    real, allocatable :: mapfac_x(:,:)
    real, allocatable :: mapfac_y(:,:)
 
+   ! for psi/chi calculation
+   real, allocatable :: mapfac_m(:,:)
+   real, allocatable :: mapfac_u(:,:)
+   real, allocatable :: mapfac_v(:,:)
+
    real :: mean_scale
    real :: spike_tolerance = 1.5
    integer :: n_smth_sl = 2
@@ -218,7 +233,7 @@ program gen_be_v3
    if ( myproc == root ) write(stdout,*) 'num_procs = ', num_procs
    if ( myproc == root ) write(stdout,*) 'nvar = ', nvar
    if ( nvar > 0 ) then
-      allocate (read_it(nvar))
+      allocate (do_this_var(nvar))
       allocate (var_dim(nvar))
    else
       call error_handler(-1, 'varnames not set in namelist.gen_be_v3')
@@ -231,6 +246,22 @@ program gen_be_v3
       end if
    end do
 
+   ! convert varnames to be in upper case
+   do iv = 1, nvar
+      vartmp = varnames(iv)
+      do i = 1, len_trim(vartmp)
+         icode = ichar(vartmp(i:i))
+         if (icode>=97 .and. icode<=122) then
+            vartmp(i:i) = char(icode - 97 + 65)
+         end if
+      end do
+      varnames(iv) = vartmp
+      ! allow ps_u in user namelist, but set it to psfc_u internally
+      if ( trim(varnames(iv)) == 'PS_U' ) then
+         varnames(iv) = 'PSFC_U'
+      end if
+   end do
+
    ! convert be_method to be in upper case
    do i = 1, len_trim(be_method)
       icode = ichar(be_method(i:i))
@@ -260,7 +291,7 @@ program gen_be_v3
    ! as well as dimensions (ni, nj, nk), ds, and mp_physics option.
    call get_info(nc_list_file)
 
-   nvar_avail = count(read_it(:))
+   nvar_avail = count(do_this_var(:))
    if ( nvar_avail == 0 ) then
       call error_handler(-1, 'no valid variables found from the varnames in namelist.gen_be_v3')
    end if
@@ -280,6 +311,20 @@ program gen_be_v3
 
    if ( myproc == root ) write(stdout, '(a,3i5,f10.2)') ' ni, nj, nk, ds = ', ni, nj, nk, ds
 
+   ni1 = ni + 1
+   nj1 = nj + 1
+
+   if ( calc_psi .or. calc_chi ) then
+      allocate(mapfac_m(ni,nj))
+      allocate(mapfac_u(ni1,nj))
+      allocate(mapfac_v(ni,nj1))
+      if ( trim(aux_file) == 'none' ) then
+         ! read MAPFAC_M, MAPFAC_U and MAPFAC_V from file filenames(1,1)
+         aux_file = filenames(1,1)
+      end if
+      call read_fixed_fields_mapfac(trim(aux_file))
+   end if
+
    if ( do_slen_calc .and. slen_opt==2 ) then
       allocate(mapfac_x(ni,nj))
       allocate(mapfac_y(ni,nj))
@@ -296,7 +341,7 @@ program gen_be_v3
 
    call ext_ncd_ioexit(ierr)
 
-   if ( do_slen_calc ) then
+   if ( calc_regcoeff .or. do_slen_calc ) then
       ! hard-code bin_type to be 5
       ! set num_bins, num_bins2d, bin, bin2d with bin_type=5 values
       allocate(bin(ni,nj,nk))
@@ -307,7 +352,19 @@ program gen_be_v3
          bin(:,:,k) = k
       end do
       bin2d(:,:) = 1
+   end if
 
+   if ( calc_regcoeff ) then
+#ifdef DM_PARALLEL
+      call mpi_barrier(mpi_comm_world,ierr)
+#endif
+      call compute_regcoeff_unbalanced
+      ! force pert1_read_opt=2 in order to read in unbalanced fields that are in separate files
+      pert1_read_opt = 2
+      !if ( myproc == root ) write(stdout, '(a,3i5,f10.2)') ' --- Using pert1_read_opt=2 for unbalanced fields ---'
+   end if
+
+   if ( do_slen_calc ) then
       allocate( be_data(nvar) )
       do i = 1, nvar
          allocate( be_data(i)%e_vec(1:nk,1:nk) )
@@ -336,7 +393,7 @@ program gen_be_v3
          allocate(r8tmp3d(1:nk,1:nk,1:num_bins2d))
       end if
       do i = 1, nvar
-         if ( .not. read_it(i) ) cycle
+         if ( .not. do_this_var(i) ) cycle
          if ( myproc /= MOD((i-1), num_procs) ) cycle
 
          ! deal with zero scale_length
@@ -455,9 +512,15 @@ program gen_be_v3
       if ( allocated(mapfac_y) ) deallocate (mapfac_y)
    end if
 
+   if ( calc_psi .or. calc_chi ) then
+      if ( allocated(mapfac_m) ) deallocate (mapfac_m)
+      if ( allocated(mapfac_u) ) deallocate (mapfac_u)
+      if ( allocated(mapfac_v) ) deallocate (mapfac_v)
+   end if
+
    if ( allocated(filenames) ) deallocate(filenames)
    if ( allocated(filedates) ) deallocate(filedates)
-   if ( allocated(read_it) ) deallocate(read_it)
+   if ( allocated(do_this_var) ) deallocate(do_this_var)
    if ( allocated(var_dim) ) deallocate(var_dim)
 
 if ( myproc == root ) write(stdout,'(a)')' All Done!'
@@ -579,30 +642,75 @@ subroutine get_info(nc_list_file)
          read_t   = .false.
          read_qv  = .false.
          read_prs = .false.
+         read_psfc= .false.
+         read_u   = .false.
+         read_v   = .false.
+         calc_psi = .false.
+         calc_chi = .false.
+         calc_regcoeff = .false.
+         indx_psi = -1
          do i = 1, nvar
             call ext_ncd_get_var_info (fid, trim(varnames(i)), ndim, ordering, staggering, &
                                        start_index, end_index, wrftype, ierr)
             var_dim(i) = ndim
-            if ( ierr == 0 ) then
-               read_it(i) = .true.
+            if ( ierr == 0 ) then ! direct variables
+               do_this_var(i) = .true.
                if ( trim(varnames(i)) == 'T' ) then
                   read_prs = .true.
                end if
-            else
+            else ! possible derived variables
                if ( trim(varnames(i)) == 'RH' ) then
+                  do_this_var(i) = .true.
                   read_t   = .true.
                   read_qv  = .true.
                   read_prs = .true.
-                  read_it(i) = .true.
                   var_dim(i) = 3
+               else if ( trim(varnames(i)) == 'PSI' ) then
+                  do_this_var(i) = .true.
+                  read_u = .true.
+                  read_v = .true.
+                  var_dim(i) = 3
+                  calc_psi = .true.
+                  indx_psi = i
+               else if ( trim(varnames(i)) == 'CHI' ) then
+                  do_this_var(i) = .true.
+                  read_u = .true.
+                  read_v = .true.
+                  var_dim(i) = 3
+                  calc_chi = .true.
+               else if ( trim(varnames(i)) == 'CHI_U' ) then
+                  do_this_var(i) = .true.
+                  read_u = .true.
+                  read_v = .true.
+                  var_dim(i) = 3
+                  calc_psi = .true.
+                  calc_chi = .true.
+                  calc_regcoeff = .true.
+               else if ( trim(varnames(i)) == 'T_U' ) then
+                  do_this_var(i) = .true.
+                  read_prs = .true.
+                  read_t = .true.
+                  read_u = .true.
+                  read_v = .true.
+                  var_dim(i) = 3
+                  calc_psi = .true.
+                  calc_regcoeff = .true.
+               else if ( trim(varnames(i)) == 'PSFC_U' ) then
+                  do_this_var(i) = .true.
+                  read_psfc = .true.
+                  read_u = .true.
+                  read_v = .true.
+                  var_dim(i) = 2
+                  calc_psi = .true.
+                  calc_regcoeff = .true.
                else
-                  read_it(i) = .false.
+                  do_this_var(i) = .false.
                   if ( myproc == root ) write(stdout,'(3a)') &
-                     ' Warning: ', trim(varnames(i)), ' not available in the input file, skipping it'
+                     ' Warning: ', trim(varnames(i)), ' not available in the input file or not implemented, skipping it'
                end if
-            end if
-         end do
-      end if
+            end if ! ext_ncd_get_var_info check
+         end do ! nvar loop
+      end if ! one-time check
 
       call ext_ncd_ioclose(fid, ierr)
 
@@ -753,6 +861,94 @@ subroutine read_fixed_fields(input_file)
 
 end subroutine read_fixed_fields
 
+subroutine read_fixed_fields_mapfac(input_file)
+
+   implicit none
+
+   integer :: fid, ierr
+   character(len=*),intent(in)     :: input_file
+   character(len=DateStrLen)       :: DateStr
+   real(r_single), allocatable     :: xfield2d(:,:)
+   real(r_single), allocatable     :: xfield2d_u(:,:)
+   real(r_single), allocatable     :: xfield2d_v(:,:)
+   character(len=80), dimension(3) :: dimnames
+   character(len=4)                :: staggering = ' N/A' !dummy
+   character(len=3)                :: ordering
+   integer, dimension(4)           :: start_index, end_index
+   integer                         :: ndim, wrftype
+   character(len=512)              :: message
+   character(len=VarNameLen)       :: varname
+
+   !input_file = trim(filenames(1,1))
+
+   call ext_ncd_open_for_read(trim(input_file), 0, 0, "", fid, ierr)
+   write(message,'(a,i3,a,a)') ' Proc ', myproc, ' opening ', trim(input_file)
+   call error_handler(ierr, trim(message))
+
+   call ext_ncd_get_next_time(fid, DateStr, ierr)
+   call error_handler(ierr, 'ext_ncd_get_next_time')
+
+   if ( .not. allocated(xfield2d) ) allocate (xfield2d(ni, nj))
+
+   if ( .not. allocated(mapfac_m) ) allocate (mapfac_m(ni, nj))
+   varname = 'MAPFAC_M'
+   call ext_ncd_get_var_info (fid, trim(varname), ndim, ordering, staggering,  &
+                              start_index, end_index, wrftype, ierr)
+   call error_handler(ierr, 'ext_ncd_get_var_info '//trim(varname))
+   call ext_ncd_read_field(fid, DateStr, trim(varname),  &
+                           xfield2d(:,:), wrftype,       &
+                           0, 0, 0, ordering,            &
+                           staggering, dimnames,         & !dummy
+                           start_index, end_index,       & !dom
+                           start_index, end_index,       & !mem
+                           start_index, end_index,       & !pat
+                           ierr                   )
+   call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+   mapfac_m(:,:) = xfield2d(:,:)
+   deallocate (xfield2d)
+
+   if ( .not. allocated(xfield2d_u) ) allocate (xfield2d_u(ni1, nj))
+
+   if ( .not. allocated(mapfac_u) ) allocate (mapfac_u(ni1, nj))
+   varname = 'MAPFAC_U'
+   call ext_ncd_get_var_info (fid, trim(varname), ndim, ordering, staggering,  &
+                              start_index, end_index, wrftype, ierr)
+   call error_handler(ierr, 'ext_ncd_get_var_info '//trim(varname))
+   call ext_ncd_read_field(fid, DateStr, trim(varname),  &
+                           xfield2d_u(:,:), wrftype,     &
+                           0, 0, 0, ordering,            &
+                           staggering, dimnames,         & !dummy
+                           start_index, end_index,       & !dom
+                           start_index, end_index,       & !mem
+                           start_index, end_index,       & !pat
+                           ierr                   )
+   call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+   mapfac_u(:,:) = xfield2d_u(:,:)
+   deallocate (xfield2d_u)
+
+   if ( .not. allocated(xfield2d_v) ) allocate (xfield2d_v(ni, nj1))
+
+   if ( .not. allocated(mapfac_v) ) allocate (mapfac_v(ni, nj1))
+   varname = 'MAPFAC_V'
+   call ext_ncd_get_var_info (fid, trim(varname), ndim, ordering, staggering,  &
+                              start_index, end_index, wrftype, ierr)
+   call error_handler(ierr, 'ext_ncd_get_var_info '//trim(varname))
+   call ext_ncd_read_field(fid, DateStr, trim(varname),  &
+                           xfield2d_v(:,:), wrftype,     &
+                           0, 0, 0, ordering,            &
+                           staggering, dimnames,         & !dummy
+                           start_index, end_index,       & !dom
+                           start_index, end_index,       & !mem
+                           start_index, end_index,       & !pat
+                           ierr                   )
+   call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+   mapfac_v(:,:) = xfield2d_v(:,:)
+   deallocate (xfield2d_v)
+
+   call ext_ncd_ioclose(fid, ierr)
+
+end subroutine read_fixed_fields_mapfac
+
 subroutine compute_pert
 
    implicit none
@@ -790,6 +986,13 @@ subroutine compute_pert
    real(r_single), allocatable :: xfield_v(:,:,:)
    real(r_single), allocatable :: xfield_w(:,:,:)
 
+   real, allocatable :: vor(:,:,:)    ! vorticity
+   real, allocatable :: div(:,:,:)    ! divergence
+   real, allocatable :: psi(:,:,:)    ! stream function
+   real, allocatable :: chi(:,:,:)    ! velocity potential
+   real, allocatable :: ustag(:,:,:)  ! u on staggered grid
+   real, allocatable :: vstag(:,:,:)  ! v on staggered grid
+   real, allocatable :: psfc(:,:)     ! surface pressure
    real, allocatable :: prs(:,:,:)    ! prs on half levels
    real, allocatable :: prs_w(:,:,:)  ! prs on full levels
    real, allocatable :: ght(:,:,:)
@@ -800,7 +1003,7 @@ subroutine compute_pert
    real, allocatable :: znw(:)
    real    :: p_top
    real    :: tc, es, qs
-   logical :: got_prs, got_th, got_qv
+   logical :: got_prs, got_th, got_qv, got_u, got_v, got_psfc
 
    real, allocatable :: time_mean(:,:,:,:)
    real(r_double), allocatable :: xsum_loc(:,:,:,:), xsum(:,:,:,:)
@@ -867,13 +1070,22 @@ subroutine compute_pert
       ' Proc ', myproc, ' will read case files ', case_istart, ' - ', case_iend
 
    if ( ens_iend >= ens_istart .and. case_iend >= case_istart ) then
-      allocate(xdata(nvar,ens_istart:ens_iend,case_istart:case_iend), stat=ierr)
+      if ( calc_psi .and. indx_psi < 0 ) then
+         ! when psi is not in the var list but is needed for unbalanced fields
+         ! allocate index nvar+1 for psi
+         allocate(xdata(nvar+1,ens_istart:ens_iend,case_istart:case_iend), stat=ierr)
+         indx_psi = nvar + 1
+         nvar_all = nvar + 1
+      else
+         allocate(xdata(nvar,ens_istart:ens_iend,case_istart:case_iend), stat=ierr)
+         nvar_all = nvar
+      end if
       call error_handler(ierr, 'allocating xdata in compute_pert')
    end if
 
    do ic = case_istart, case_iend
       do ie = ens_istart, ens_iend
-         do iv = 1, nvar
+         do iv = 1, nvar_all
             allocate(xdata(iv,ie,ic)%value(ni,nj,nk), stat=ierr)
             call error_handler(ierr, 'allocating xdata%value in compute_pert')
             xdata(iv,ie,ic)%value(:,:,:) = 0.0
@@ -892,6 +1104,9 @@ subroutine compute_pert
 
          got_qv = .false.
          got_th = .false.
+         got_u  = .false.
+         got_v  = .false.
+         got_psfc = .false.
 
          call ext_ncd_open_for_read(trim(input_file), 0, 0, "", fid, ierr)
          write(message,'(a,i3,a,a)') ' Proc ', myproc, ' opening ', trim(input_file)
@@ -941,6 +1156,26 @@ subroutine compute_pert
             end if ! p, pb
          end if ! read_prs
 
+         if ( read_psfc ) then
+            if ( .not. allocated(xfield) ) allocate (xfield  (ni, nj, nk))
+            varname = 'PSFC'
+            call ext_ncd_get_var_info (fid, trim(varname), ndim, ordering, staggering,  &
+                                       start_index, end_index, wrftype, ierr)
+            call error_handler(ierr, 'ext_ncd_get_var_info '//trim(varname))
+            call ext_ncd_read_field(fid, DateStr, varname,     &
+                                    xfield(:,:,1), wrftype,    &
+                                    0, 0, 0, ordering,         &
+                                    staggering, dimnames,      & !dummy
+                                    start_index, end_index,    & !dom
+                                    start_index, end_index,    & !mem
+                                    start_index, end_index,    & !pat
+                                    ierr                   )
+            call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+            if ( .not. allocated(psfc) ) allocate (psfc(ni, nj))
+            psfc(:,:) = xfield(:,:,1)
+            got_psfc = .true.
+         end if
+
          if ( read_ght ) then
             if ( .not. allocated(ght) )      allocate (ght(ni, nj, nk))
             if ( .not. allocated(xfield_w) ) allocate (xfield_w(ni, nj ,nk1))
@@ -1086,7 +1321,7 @@ subroutine compute_pert
                                        start_index, end_index, wrftype, ierr)
             call error_handler(ierr, 'ext_ncd_get_var_info '//trim(varname))
             call ext_ncd_read_field(fid, DateStr, trim(varname),  &
-                                    xfield_w(1,1,:), wrftype,       &
+                                    xfield_w(1,1,:), wrftype,     &
                                     0, 0, 0, ordering,            &
                                     staggering, dimnames,         & !dummy
                                     start_index, end_index,       & !dom
@@ -1126,9 +1361,71 @@ subroutine compute_pert
             deallocate (znw)
          end if ! read_prs, got_prs
 
+         if ( read_u ) then
+            if ( .not. allocated(xfield_u) ) allocate (xfield_u(ni1,nj, nk))
+            if ( .not. allocated(ustag) ) allocate (ustag(ni1, nj, nk))
+            varname = 'U'
+            call ext_ncd_get_var_info (fid, trim(varname), ndim, ordering, staggering,  &
+                                       start_index, end_index, wrftype, ierr)
+            call error_handler(ierr, 'ext_ncd_get_var_info '//trim(varname))
+            call ext_ncd_read_field(fid, DateStr, varname,     &
+                                    xfield_u(:,:,:), wrftype,  &
+                                    0, 0, 0, ordering,         &
+                                    staggering, dimnames,      & !dummy
+                                    start_index, end_index,    & !dom
+                                    start_index, end_index,    & !mem
+                                    start_index, end_index,    & !pat
+                                    ierr                   )
+            call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+            ustag(:,:,:) = xfield_u(:,:,:)
+            got_u = .true.
+         end if
+
+         if ( read_v ) then
+            if ( .not. allocated(xfield_v) ) allocate (xfield_v(ni, nj1,nk))
+            if ( .not. allocated(vstag) ) allocate (vstag(ni, nj1, nk))
+            varname = 'V'
+            call ext_ncd_get_var_info (fid, trim(varname), ndim, ordering, staggering,  &
+                                       start_index, end_index, wrftype, ierr)
+            call error_handler(ierr, 'ext_ncd_get_var_info '//trim(varname))
+            call ext_ncd_read_field(fid, DateStr, varname,     &
+                                    xfield_v(:,:,:), wrftype,  &
+                                    0, 0, 0, ordering,         &
+                                    staggering, dimnames,      & !dummy
+                                    start_index, end_index,    & !dom
+                                    start_index, end_index,    & !mem
+                                    start_index, end_index,    & !pat
+                                    ierr                   )
+            call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+            vstag(:,:,:) = xfield_v(:,:,:)
+            got_v = .true.
+         end if
+
+         if ( calc_psi .and. got_u .and. got_v ) then
+            if ( .not. allocated(vor) ) allocate (vor(ni+1, nj+1, nk))
+            if ( .not. allocated(psi) ) allocate (psi(ni+1, nj+1, nk))
+            ! Calculate vorticity (in center of mass grid on WRF's Arakawa C-grid)
+            call da_uv_to_vor_c(ni, nj, nk, ds, &
+                                mapfac_m, mapfac_u, mapfac_v, ustag, vstag, vor)
+            ! Calculate streamfunction
+            ! Assumes vor converted to Del**2 psi
+            call da_del2a_to_a(ni+1, nj+1, nk, ds, vor, psi)
+         end if ! calc_psi
+
+         if ( calc_chi .and. got_u .and. got_v ) then
+            if ( .not. allocated(div) ) allocate (div(ni, nj, nk))
+            if ( .not. allocated(chi) ) allocate (chi(ni, nj, nk))
+            ! Calculate divergence (at mass pts. on WRF's Arakawa C-grid)
+            call da_uv_to_div_c(ni, nj, nk, ds, &
+                                 mapfac_m, mapfac_u, mapfac_v, ustag, vstag, div)
+            ! Calculate velocity potential
+            ! Assumes div converted to Del**2 chi
+            call da_del2a_to_a(ni, nj, nk, ds, div, chi)
+         end if ! calc_chi
+
          var_loop1: do iv = 1, nvar
 
-            if ( .not. read_it(iv) ) cycle var_loop1
+            if ( .not. do_this_var(iv) ) cycle var_loop1
 
             varname = trim(varnames(iv))
             call ext_ncd_get_var_info (fid, varname, ndim, ordering, staggering, &
@@ -1138,9 +1435,10 @@ subroutine compute_pert
             !write(stdout, '(a,i3,a,a8,a,a)') ' Proc ', myproc, ' Reading ', &
             !   trim(varname), ' from ', trim(input_file)
 
-            if ( varname == 'PSFC' ) then
+            if ( varname == 'PSFC' .or. varname == 'PSFC_U' ) then
                if ( .not. allocated(xfield) ) allocate (xfield  (ni, nj, nk))
-               call ext_ncd_read_field(fid, DateStr, varname,     &
+               if ( .not. got_psfc ) then
+                  call ext_ncd_read_field(fid, DateStr, varname,  &
                                        xfield(:,:,1), wrftype,    &
                                        0, 0, 0, ordering,         &
                                        staggering, dimnames,      & !dummy
@@ -1148,11 +1446,15 @@ subroutine compute_pert
                                        start_index, end_index,    & !mem
                                        start_index, end_index,    & !pat
                                        ierr                   )
-               call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+                  call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+               else
+                  xfield(:,:,1) = psfc(:,:)
+               end if
                xdata(iv,ie,ic)%value(:,:,1) = xfield(:,:,1)
             else if ( varname == 'U' ) then
                if ( .not. allocated(xfield_u) ) allocate (xfield_u(ni1,nj, nk))
-               call ext_ncd_read_field(fid, DateStr, varname,     &
+               if ( .not. got_u ) then
+                  call ext_ncd_read_field(fid, DateStr, varname,  &
                                        xfield_u(:,:,:), wrftype,  &
                                        0, 0, 0, ordering,         &
                                        staggering, dimnames,      & !dummy
@@ -1160,7 +1462,10 @@ subroutine compute_pert
                                        start_index, end_index,    & !mem
                                        start_index, end_index,    & !pat
                                        ierr                   )
-               call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+                  call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+               else
+                  xfield_u(:,:,:) = ustag(:,:,:)
+               end if
                do k = 1, nk
                   do j = 1, nj
                      do i = 1, ni
@@ -1171,7 +1476,8 @@ subroutine compute_pert
                end do
             else if ( varname == 'V' ) then
                if ( .not. allocated(xfield_v) ) allocate (xfield_v(ni, nj1,nk))
-               call ext_ncd_read_field(fid, DateStr, varname,     &
+               if ( .not. got_v ) then
+                  call ext_ncd_read_field(fid, DateStr, varname,  &
                                        xfield_v(:,:,:), wrftype,  &
                                        0, 0, 0, ordering,         &
                                        staggering, dimnames,      & !dummy
@@ -1179,7 +1485,10 @@ subroutine compute_pert
                                        start_index, end_index,    & !mem
                                        start_index, end_index,    & !pat
                                        ierr                   )
-               call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+                  call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+               else
+                  xfield_v(:,:,:) = vstag(:,:,:)
+               end if
                do k = 1, nk
                   do j = 1, nj
                      do i = 1, ni
@@ -1238,7 +1547,7 @@ subroutine compute_pert
                   xdata(iv,ie,ic)%value(:,:,:) = &
                      xfield(:,:,:) / ( 1.0 + xfield(:,:,:) )
                end if ! got_qv
-            else if ( varname == 'T' ) then
+            else if ( varname == 'T' .or. varname == 'T_U' ) then
                if ( got_th ) then
                   ! potential temperature to temperature
                   xdata(iv,ie,ic)%value(:,:,:) = &
@@ -1258,6 +1567,10 @@ subroutine compute_pert
                   xdata(iv,ie,ic)%value(:,:,:) = &
                      (t00+xfield(:,:,:))*(prs(:,:,:)/p00)**kappa
                end if ! got_th
+            else if ( varname == 'PSI' ) then
+               xdata(iv,ie,ic)%value(:,:,:) = psi(:,:,:)
+            else if ( varname == 'CHI' .or. varname == 'CHI_U' ) then
+               xdata(iv,ie,ic)%value(:,:,:) = chi(:,:,:)
             else
                if ( .not. allocated(xfield) ) allocate (xfield  (ni, nj, nk))
                call ext_ncd_read_field(fid, DateStr, varname,     &
@@ -1279,6 +1592,10 @@ subroutine compute_pert
 
          end do var_loop1 ! nvar loop
 
+         if ( indx_psi > nvar ) then
+            xdata(indx_psi,ie,ic)%value(:,:,:) = psi(:,:,:)
+         end if
+
          call ext_ncd_ioclose(fid, ierr)
 
       end do ens_loop1 ! ensemble member loop
@@ -1291,13 +1608,13 @@ subroutine compute_pert
          if ( remove_time_mean ) then
             ! allocate for summing up the cases each processor sees
             if ( .not. allocated(xsum_loc) )  then
-               allocate(xsum_loc(ni,nj,nk,nvar), stat=ierr)
+               allocate(xsum_loc(ni,nj,nk,nvar_all), stat=ierr)
                call error_handler(ierr, 'allocating xsum_loc')
                xsum_loc = 0.0
             end if
          end if
 
-         do iv = 1, nvar
+         do iv = 1, nvar_all
             ! store forecast difference in index 1 of ie
             xdata(iv,1,ic)%value(:,:,:) = xdata(iv,1,ic)%value(:,:,:) - xdata(iv,2,ic)%value(:,:,:)
             if ( remove_time_mean ) then
@@ -1329,12 +1646,12 @@ subroutine compute_pert
 
          if ( .not. allocated(ens_sum_loc) ) allocate(ens_sum_loc(ni,nj,nk))
          if ( .not. allocated(ens_sum) )     allocate(ens_sum(ni,nj,nk))
-         if ( .not. allocated(ens_mean) )    allocate(ens_mean(ni,nj,nk,nvar))
+         if ( .not. allocated(ens_mean) )    allocate(ens_mean(ni,nj,nk,nvar_all))
 
          if ( myproc == root ) write(stdout,'(a)') ' ====== Computing ensemble mean ======'
 
-         var_loop2: do iv = 1, nvar
-            if ( .not. read_it(iv) ) cycle var_loop2
+         var_loop2: do iv = 1, nvar_all
+            if ( (iv<=nvar) .and. (.not. do_this_var(iv)) ) cycle var_loop2
             ens_sum_loc(:,:,:) = 0.0 ! initialize for each variable
             ens_sum    (:,:,:) = 0.0 ! initialize for each variable
             do n = istart_ens(myproc),iend_ens(myproc)
@@ -1365,8 +1682,8 @@ subroutine compute_pert
          end if
 
          if ( myproc == root ) write(stdout,'(a)') ' ====== Computing ensemble perturbations ======'
-         do iv = 1, nvar
-            if ( .not. read_it(iv) ) cycle
+         do iv = 1, nvar_all
+            if ( (iv<=nvar) .and. (.not. do_this_var(iv)) ) cycle
             do ie = ens_istart, ens_iend ! each proc loops over a subset of ens
                if ( remove_ens_mean ) then
                   xdata(iv,ie,ic)%value(:,:,:) = xdata(iv,ie,ic)%value(:,:,:) - ens_mean(:,:,:,iv)
@@ -1378,7 +1695,7 @@ subroutine compute_pert
             if ( myproc == root ) write(stdout,'(a)') ' ====== Computing ensemble stdv ======'
             allocate(ens_stdv(ni,nj,nk,nvar))
             do iv = 1, nvar
-               if ( .not. read_it(iv) ) cycle
+               if ( .not. do_this_var(iv) ) cycle
                ens_sum_loc(:,:,:) = 0.0 ! initialize for each variable
                ens_sum    (:,:,:) = 0.0 ! initialize for each variable
                do n = istart_ens(myproc),iend_ens(myproc)
@@ -1401,6 +1718,21 @@ subroutine compute_pert
             deallocate(ens_sum)
          end if
 
+         if ( calc_regcoeff ) then
+            ! write out psi
+            do ie = ens_istart, ens_iend ! each proc loops over a subset of ens
+               write(ce,'(i3.3)') ie
+               output_file = 'psi.'//filedates(ie,ic)//'.e'//trim(ce)
+               !write(stdout,'(a,i3,a,a)') ' Proc', myproc, ' writing to ', trim(output_file)
+               open (ounit, file = output_file, form='unformatted')
+               write(ounit) 1, ni, nj, nk
+               write(ounit) 3
+               write(ounit) filedates(ie,ic), 'PSI       '
+               write(ounit) xdata(indx_psi,ie,ic)%value(:,:,:)
+               close(ounit)
+            end do
+         end if
+
          if ( write_pert1 .or. do_slen_calc ) then
             do ie = ens_istart, ens_iend ! each proc loops over a subset of ens
                write(ce,'(i3.3)') ie
@@ -1426,7 +1758,7 @@ subroutine compute_pert
                allocate(r8tmp(ni,nj,nk))
             end if
             do iv = 1, nvar
-               if ( .not. read_it(iv) ) cycle
+               if ( .not. do_this_var(iv) ) cycle
                do ie = ens_istart, ens_iend ! each proc loops over a subset of ens
                   write(ce,'(i3.3)') ie
                   !output_file = trim(varnames(iv))//'.e'//trim(ce)
@@ -1500,7 +1832,7 @@ subroutine compute_pert
 
             my_nvar = 0
             var_loop3: do iv = 1, nvar
-               if ( .not. read_it(iv) ) cycle var_loop3
+               if ( .not. do_this_var(iv) ) cycle var_loop3
 
                ! initialize to zero for each variable
                if ( myproc == root ) globuf(:,:,:,:) = 0.0
@@ -1612,6 +1944,13 @@ subroutine compute_pert
 
    end do case_loop1 ! case loop
 
+   if ( allocated(vor) )   deallocate (vor)
+   if ( allocated(div) )   deallocate (div)
+   if ( allocated(psi) )   deallocate (psi)
+   if ( allocated(chi) )   deallocate (chi)
+   if ( allocated(ustag) ) deallocate (ustag)
+   if ( allocated(vstag) ) deallocate (vstag)
+   if ( allocated(psfc) )  deallocate (psfc)
    if ( allocated(prs) )   deallocate (prs)
    if ( allocated(ght) )   deallocate (ght)
    if ( allocated(theta) ) deallocate (theta)
@@ -1623,22 +1962,22 @@ subroutine compute_pert
 
    if ( trim(be_method) == 'NMC' ) then
       if ( remove_time_mean ) then
-         if ( .not. allocated(time_mean) ) allocate(time_mean(ni,nj,nk,nvar))
+         if ( .not. allocated(time_mean) ) allocate(time_mean(ni,nj,nk,nvar_all))
 #ifdef DM_PARALLEL
-         if ( .not. allocated(xsum) )      allocate(xsum(ni,nj,nk,nvar))
-         do iv = 1, nvar
+         if ( .not. allocated(xsum) )      allocate(xsum(ni,nj,nk,nvar_all))
+         do iv = 1, nvar_all
             call mpi_allreduce(xsum_loc(:,:,:,iv),xsum(:,:,:,iv), ijk, &
                                mpi_real8, mpi_sum, mpi_comm_world,ierr)
             time_mean(:,:,:,iv) = xsum(:,:,:,iv)/real(ncase)
          end do
          deallocate(xsum)
 #else
-         do iv = 1, nvar
+         do iv = 1, nvar_all
             time_mean(:,:,:,iv) = xsum_loc(:,:,:,iv)/real(ncase)
          end do
 #endif
          do ic = case_istart, case_iend
-            do iv = 1, nvar
+            do iv = 1, nvar_all
                xdata(iv,1,ic)%value(:,:,:) = xdata(iv,1,ic)%value(:,:,:) - time_mean(:,:,:,iv)
             end do
          end do
@@ -1646,6 +1985,20 @@ subroutine compute_pert
          deallocate(xsum_loc)
       end if ! remove_time_mean
 
+      if ( calc_regcoeff ) then
+         ! write out psi
+         do ic = case_istart, case_iend
+            output_file = 'psi.'//filedates(1,ic)//'.e001'
+            !write(stdout,'(a,i3,a,a)') ' Proc', myproc, ' writing to ', trim(output_file)
+            open (ounit, file = output_file, form='unformatted')
+            write(ounit) 1, ni, nj, nk
+            write(ounit) 3
+            write(ounit) filedates(1,ic), 'PSI       '
+            write(ounit) xdata(indx_psi,1,ic)%value(:,:,:)
+            close(ounit)
+         end do
+      end if
+
       if ( write_pert1 .or. do_slen_calc ) then
          do ic = case_istart, case_iend
             output_file = 'pert1.'//filedates(1,ic)//'.e001'
@@ -1673,8 +2026,10 @@ subroutine compute_pert
 
    do ic = case_istart, case_iend
       do ie = ens_istart, ens_iend
-         do iv = 1, nvar
-            deallocate(xdata(iv,ie,ic)%value)
+         do iv = 1, nvar_all
+            if ( allocated(xdata(iv,ie,ic)%value) ) then
+               deallocate(xdata(iv,ie,ic)%value)
+            end if
          end do
       end do
    end do
@@ -1682,60 +2037,61 @@ subroutine compute_pert
 
 end subroutine compute_pert
 
-subroutine compute_bv_sl
+subroutine compute_regcoeff_unbalanced
+! adapted from var/gen_be/gen_be_stage2.f90 and var/gen_be/gen_be_stage2a.f90
 
-   !use da_lapack, only : dsyev
    implicit none
 
-   character(len=filename_len)  :: filename                   ! Input filename.
-   character(len=filename_len)  :: output_file
-   integer             :: i, j, k, k1, k2, b         ! Loop counters.
-   integer             :: ic, ie, iv, m
+   real, parameter     :: variance_threshold = 1e-6  ! Percentage of <psi psi> variance discarded.
+
+   character(len=filename_len) :: input_file, output_file
+   character*3         :: ce                         ! Member index -> character
+   integer             :: i, j, k, member, k2, k3, m ! Loop counters
+   integer             :: b                          ! Bin marker
+   integer             :: mmax                       ! Maximum mode (after variance truncation)
+   real                :: coeffa, coeffb             ! Accumulating mean coefficients
+   real                :: total_variance             ! Total variance of <psi psi> matrix
+   real                :: cumul_variance             ! Cumulative variance of <psi psi> matrix
+   real                :: summ                       ! Summation dummy
+
+   real, allocatable   :: psi(:,:,:)                 ! psi
+   real, allocatable   :: chi(:,:,:)                 ! chi
+   real, allocatable   :: temp(:,:,:)                ! Temperature
+   real, allocatable   :: ps(:,:,:)                  ! Surface pressure
+
+   integer, allocatable:: bin_pts(:)                 ! Number of points in bin (3D fields)
+   integer, allocatable:: bin_pts2d(:)               ! Number of points in bin (2D fields)
+   real, allocatable   :: covar1(:)                  ! Covariance between input fields
+   real, allocatable   :: covar2(:,:)                ! Covariance between input fields
+   real, allocatable   :: covar3(:,:,:)              ! Covariance between input fields
+   real, allocatable   :: var1(:)                    ! Autocovariance of field
+   real, allocatable   :: var2(:,:,:)                ! Autocovariance of field
+   real, allocatable   :: var2_inv(:,:,:)            ! Inverse Autocovariance of field
+
+   real, allocatable   :: work(:,:)                  ! EOF work array
+   real*8, allocatable :: evec(:,:)                  ! Gridpoint eigenvectors
+   real*8, allocatable :: eval(:)                    ! Gridpoint sqrt(eigenvalues)
+   real, allocatable   :: LamInvET(:,:)              ! ET/sqrt(Eigenvalue)
+   real, allocatable   :: regcoeff1(:)               ! psi/chi regression cooefficient
+   real, allocatable   :: regcoeff2(:,:)             ! psi/ps regression cooefficient
+   real, allocatable   :: regcoeff3(:,:,:)           ! psi/T regression cooefficient
+
+   integer :: ic, ie, iv
    integer :: istart_member, iend_member
-   real                :: inv_nij                    ! 1 / (ni*nj).
-   real                :: mean_field                 ! Mean field.
-   real                :: coeffa, coeffb             ! Accumulating mean coefficients.
-   logical             :: testing_eofs               ! True if testing EOF decomposition.
-   logical             :: use_global_eofs            ! True if projected data uses global EOFs.
+   integer :: iunit, ounit
 
-   integer, allocatable:: bin_pts2d(:)               ! Number of points in bin (2D fields).
-   real, allocatable   :: field(:,:,:)               ! Input field.
-   real, allocatable   :: fieldv(:,:,:)              ! projected field
-   real, allocatable   :: field2d(:,:)               ! Input field.
-   real, allocatable   :: bv(:,:,:)                  ! Vertical BE for this time.
-   real, allocatable   :: work(:,:)                  ! EOF work array.
-   real, allocatable   :: e_vec_loc(:,:,:)           ! Latitudinally varying eigenvectors.
-   real, allocatable   :: e_val_loc(:,:)             ! Latitudinally varying eigenvalues.
-   real*8, allocatable :: e_vec(:,:)                 ! Domain-averaged eigenvectors.
-   real*8, allocatable :: e_val(:)                   ! Domain-averaged eigenvalues.
-   real*8, allocatable :: evec(:,:,:)                ! Gridpoint eigenvectors.
-   real*8, allocatable :: eval(:,:)                  ! Gridpoint sqrt(eigenvalues).
-   real :: ETVp
-   character(len=DateStrLen)       :: DateStr_tmp
-   character(len=VarNameLen)       :: var_tmp
-   character(len=filename_len)     :: input_file
-   character(len=3)     :: ce
+   logical :: got_var2_inv
 
-   integer :: ndim
-   integer :: ijk
-   integer :: proc_send
-   integer :: nn
-   integer :: nvar_read, ni_read, nj_read, nk_read, nkk
-   integer :: k_start, k_end
-   integer, allocatable:: nr(:), icount(:)
-   real, allocatable   :: cov(:,:)
-   real, allocatable   :: ml(:), sl(:)
-   real, allocatable   :: sl_g(:)
-   real, allocatable   :: hl(:), hlt(:)
+   if ( myproc == root ) then
+      write(stdout,'(a)')' ====== Computing regcoeff and unbalanced fields ======'
+   end if
 
-   integer :: bv_unit = 22
-   integer :: iunit = 23
-   integer :: ounit = 24
-   testing_eofs = .false.
-   use_global_eofs = .true.
+   iunit = 23
+   ounit = 24
 
-   inv_nij = 1.0 / real(ni*nj)
-   allocate( field(1:ni,1:nj,1:nk) )
+   allocate( psi(1:ni,1:nj,1:nk) )
+   allocate( bin_pts(1:num_bins) )
+   allocate( bin_pts2d(1:num_bins2d) )
 
    if ( trim(be_method) == 'NMC' ) then
       istart_member = 1
@@ -1745,51 +2101,472 @@ subroutine compute_bv_sl
       iend_member   = nens
    end if
 
-   if ( pert1_read_opt == 1 ) then
+   var_loop: do iv = 1, nvar
 
-   allocate(xdata(1:nvar,istart_member:iend_member,1:ncase), stat=ierr)
-   call error_handler(ierr, 'allocating xdata in compute_bv_sl')
+      if ( .not. do_this_var(iv) ) cycle var_loop
+      if ( myproc /= MOD((iv-1), num_procs) ) cycle var_loop
+      if ( trim(varnames(iv)) /= 'CHI_U'  .and. &
+           trim(varnames(iv)) /= 'PSFC_U' .and. &
+           trim(varnames(iv)) /= 'T_U'         ) cycle var_loop
 
-   do ic = 1, ncase
-      do ie = istart_member, iend_member
-         write(ce,'(i3.3)') ie
-         input_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
-         inquire(file=trim(input_file), exist=isfile)
-         if ( .not. isfile ) then
-            call error_handler(-1, trim(input_file)//' not found for getting perturbation')
+      !write(stdout,'(a,i3,2a)') ' Proc', myproc, ' Computing regcoeff for variable ', trim(varnames(iv))
+
+      bin_pts(:) = 0
+      bin_pts2d(:) = 0
+
+      if ( trim(varnames(iv)) == 'CHI_U' ) then
+         if ( .not. allocated(chi) )    allocate( chi(1:ni,1:nj,1:nk) )
+         if ( .not. allocated(covar1) ) allocate( covar1(1:num_bins) )
+         if ( .not. allocated(var1) )   allocate( var1(1:num_bins) )
+         covar1(:) = 0.0
+         var1(:) = 0.0
+      end if
+
+      if ( trim(varnames(iv)) == 'PSFC_U' .or. trim(varnames(iv)) == 'T_U' ) then
+         if ( .not. allocated(var2) ) allocate( var2(1:nk,1:nk,1:num_bins2d) )
+         var2(:,:,:) = 0.0
+         if ( trim(varnames(iv)) == 'PSFC_U' ) then
+            ! still allocate 1:nk for ps even though only 1 will be used
+            ! this is needed to make get_pert1_data work for ps
+            if ( .not. allocated(ps) )     allocate( ps(1:ni,1:nj,1:nk) )
+            if ( .not. allocated(covar2) ) allocate( covar2(1:nk,1:num_bins2d) )
+            covar2(:,:) = 0.0
          end if
-         !write(stdout,'(a,i3,a,a)') ' Proc', myproc, ' Reading from ', trim(input_file)
-         open (iunit, file=trim(input_file), form='unformatted', status='old')
-         read(iunit) nvar_read, ni_read, nj_read, nk_read
-         read_loop: do iv = 1, nvar_read
-            read(iunit) ndim
-            read(iunit) DateStr_tmp, var_tmp
-            if ( ndim == 2 ) then
-               read(iunit) field(:,:,1)
-            else if ( ndim == 3 ) then
-               read(iunit) field(:,:,:)
-            end if
-            allocate(xdata(iv,ie,ic)%value(ni,nj,nk), stat=ierr)
-            call error_handler(ierr, 'allocating xdata%value in compute_bv_sl')
-            xdata(iv,ie,ic)%value(:,:,:) = field(:,:,:)
-         end do read_loop
-         close(iunit)
-      end do
-   end do
+         if ( trim(varnames(iv)) == 'T_U' ) then
+            if ( .not. allocated(temp) )   allocate( temp(1:ni,1:nj,1:nk) )
+            if ( .not. allocated(covar3) ) allocate( covar3(1:nk,1:nk,1:num_bins2d) )
+            covar3(:,:,:) = 0.0
+         end if
+      end if
 
-   end if ! pert1_read_opt=1, store all pert1 data in memory
+      write(stdout,'(a,i3,2a)') ' Proc', myproc, ' Computing psi covariances for variable ', trim(varnames(iv))
 
-   if ( do_eof_transform ) then
+      do ic = 1, ncase
+         do ie = istart_member, iend_member
+            !write(stdout,'(5a,i4)')'    Processing data for date ', filedates(ie,ic), ', variable ', trim(varnames(iv)), &
+            !                  ', member ', ie
 
-      if ( myproc == root ) write(stdout,'(4a)')'====== Computing vertical error eigenvalues, eigenvectors ======'
+            ! psi is needed for all variables
+            write(ce,'(i3.3)') ie
+            input_file = 'psi.'//filedates(ie,ic)//'.e'//trim(ce)
+            call get_pert1_data(trim(input_file), 'PSI', ni, nj, nk, psi)
 
-      allocate( bv(1:nk,1:nk,1:num_bins2d) )
-      allocate( bin_pts2d(1:num_bins2d) )
-      allocate( work(1:nk,1:nk) )
-      allocate( e_vec_loc(1:nk,1:nk,1:num_bins2d) )
-      allocate( e_val_loc(1:nk,1:num_bins2d) )
-      allocate( e_vec(1:nk,1:nk) )
-      allocate( e_val(1:nk) )
+            input_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
+            if ( trim(varnames(iv)) == 'CHI_U' ) then
+               call get_pert1_data(trim(input_file), trim(varnames(iv)), ni, nj, nk, chi)
+               ! Calculate psi/chi covariances:
+               do k = 1, nk
+                  do j = 1, nj
+                     do i = 1, ni
+                        b = bin(i,j,k)
+                        bin_pts(b) = bin_pts(b) + 1
+                        coeffa = 1.0 / real(bin_pts(b))
+                        coeffb = real(bin_pts(b)-1) * coeffa
+                        covar1(b) = coeffb * covar1(b) + coeffa * psi(i,j,k) * chi(i,j,k)
+                        var1(b) = coeffb * var1(b) + coeffa * psi(i,j,k) * psi(i,j,k)
+                     end do
+                  end do
+               end do
+            end if ! chi
+
+            if ( trim(varnames(iv)) == 'PSFC_U' .or. trim(varnames(iv)) == 'T_U' ) then
+
+               if ( trim(varnames(iv)) == 'PSFC_U' ) then
+                  call get_pert1_data(trim(input_file), trim(varnames(iv)), ni, nj, nk, ps)
+               end if ! ps
+               if ( trim(varnames(iv)) == 'T_U' ) then
+                  call get_pert1_data(trim(input_file), trim(varnames(iv)), ni, nj, nk, temp)
+               end if ! t
+
+               ! Calculate psi/ps, and psi/T, and psi/psi covariances:
+               do j = 1, nj
+                  do i = 1, ni
+                     b = bin2d(i,j)
+                     bin_pts2d(b) = bin_pts2d(b) + 1
+                     coeffa = 1.0 / real(bin_pts2d(b))
+                     coeffb = real(bin_pts2d(b)-1) * coeffa
+                     do k = 1, nk
+
+                        if ( trim(varnames(iv)) == 'PSFC_U' ) then
+                           ! psi/ps:
+                           covar2(k,b) = coeffb * covar2(k,b) + coeffa * ps(i,j,1) * psi(i,j,k)
+                        end if
+
+                        if ( trim(varnames(iv)) == 'T_U' ) then
+                           ! psi/T:
+                           do k2 = 1, nk
+                              covar3(k,k2,b) = coeffb * covar3(k,k2,b) + &
+                                               coeffa * temp(i,j,k) * psi(i,j,k2)
+                           end do
+                        end if
+
+                        ! psi/psi (symmetric):
+                        do k2 = 1, k
+                           var2(k,k2,b) = coeffb * var2(k,k2,b) + &
+                                          coeffa * psi(i,j,k) * psi(i,j,k2)
+                        end do
+
+                     end do ! k loop
+                  end do ! i loop
+               end do ! j loop
+
+            end if ! ps_u or t_u
+
+         end do ! ens member loop
+
+      end do ! ncase loop
+
+      got_var2_inv = .false. ! initialize
+
+      if ( trim(varnames(iv)) == 'PSFC_U' .or. trim(varnames(iv)) == 'T_U' ) then
+
+         ! Fill in psi/psi covariance by symmetry:
+         do b = 1, num_bins2d
+            do k = 1, nk
+               do k2 = k+1, nk ! Symmetry.
+                  var2(k,k2,b) = var2(k2,k,b)
+               end do
+            end do
+         end do
+
+         if ( .not. got_var2_inv ) then
+
+            write(stdout,'(a,i3,2a)') ' Proc', myproc, ' Computing eigenvectors, eigenvalues and inverse for psi/psi covariance'
+
+            if ( .not. allocated(work) )     allocate( work(1:nk,1:nk) )
+            if ( .not. allocated(evec) )     allocate( evec(1:nk,1:nk) )
+            if ( .not. allocated(eval) )     allocate( eval(1:nk) )
+            if ( .not. allocated(LamInvET) ) allocate( LamInvET(1:nk,1:nk) )
+            if ( .not. allocated(var2_inv) ) allocate( var2_inv(1:nk,1:nk,1:num_bins2d) )
+
+            do b = 1, num_bins2d
+               LamInvET(:,:) = 0.0
+               work(1:nk,1:nk) = var2(1:nk,1:nk,b)
+               call da_eof_decomposition( nk, work, evec, eval )
+
+               ! Truncate eigenvalues to ensure inverse is not dominated by rounding error:
+               summ = 0.0
+               do m = 1, nk
+                  summ = summ + eval(m)
+               end do
+               total_variance = summ
+
+               cumul_variance = 0.0
+               mmax = nk
+               do m = 1, nk
+                  cumul_variance = cumul_variance + eval(m) / total_variance
+                  if ( cumul_variance > 1.0 - variance_threshold ) then
+                     mmax = m - 1
+                     exit
+                  end if
+               end do
+               !write(6,'(2(a,i6),2(a,1pe11.5))') ' Bin = ', b, ', <psipsi> truncation = ', mmax, &
+               !                                  ', Total Variance = ', total_variance, &
+               !                                  ', Condition number = ', eval(1) / eval(nk-1)
+
+               ! Lam{-1} . E^T:
+               do k = 1, nk
+                  do m = 1, mmax
+                     LamInvET(m,k) = evec(k,m) / eval(m)
+                  end do
+               end do
+
+               ! <psi psi>^{-1} = E . Lam{-1} . E^T:
+               do k = 1, nk
+                  do k2 = 1, k
+                     summ = 0.0
+                     do m = 1, nk
+                        summ = summ + evec(k,m) * LamInvET(m,k2)
+                     end do
+                     var2_inv(k,k2,b) = summ
+                  end do
+               end do
+
+               do k = 1, nk
+                  do k2 = k+1, nk ! Symmetry.
+                     var2_inv(k,k2,b) = var2_inv(k2,k,b)
+                  end do
+               end do
+            end do
+            got_var2_inv = .true.
+         end if ! got_var2_inv
+      end if ! ps_u or t_u
+
+      write(stdout,'(a,i3,2a)') ' Proc', myproc, ' Computing regression coefficients for variable ', trim(varnames(iv))
+
+      if ( trim(varnames(iv)) == 'CHI_U' ) then
+         allocate( regcoeff1(1:num_bins) )
+         ! psi/chi:
+         do b = 1, num_bins
+            regcoeff1(b) = covar1(b) / var1(b)
+         end do
+         ! Output regression coefficients
+         output_file = 'regcoeff1.dat'
+         open (ounit, file=trim(output_file), form='unformatted', status='unknown')
+         write(ounit)ni, nj, nk
+         write(ounit)num_bins, num_bins2d
+         write(ounit)regcoeff1
+         close(ounit)
+      end if ! chi_u
+
+      if ( trim(varnames(iv)) == 'PSFC_U' ) then
+         allocate( regcoeff2(1:nk,1:num_bins2d) )
+         ! psi/ps:
+         do b = 1, num_bins2d
+            do k = 1, nk
+               summ = 0.0
+               do k2 = 1, nk
+                  summ = summ + covar2(k2,b) * var2_inv(k2,k,b)
+               end do
+               regcoeff2(k,b) = summ
+            end do
+         end do
+         ! Output regression coefficients
+         output_file = 'regcoeff2.dat'
+         open (ounit, file=trim(output_file), form='unformatted', status='unknown')
+         write(ounit)ni, nj, nk
+         write(ounit)num_bins, num_bins2d
+         write(ounit)regcoeff2
+         close(ounit)
+      end if ! ps_u
+
+      if ( trim(varnames(iv)) == 'T_U' ) then
+         allocate( regcoeff3(1:nk,1:nk,1:num_bins2d) )
+         ! psi/T:
+         do b = 1, num_bins2d
+            do k = 1, nk
+               do k2 = 1, nk
+                  summ = 0.0
+                  do k3 = 1, nk
+                     summ = summ + covar3(k,k3,b) * var2_inv(k3,k2,b)
+                  end do
+                  regcoeff3(k,k2,b) = summ
+               end do
+            end do
+         end do
+         ! Output regression coefficients
+         output_file = 'regcoeff3.dat'
+         open (ounit, file=trim(output_file), form='unformatted', status='unknown')
+         write(ounit)ni, nj, nk
+         write(ounit)num_bins, num_bins2d
+         write(ounit)regcoeff3
+         close(ounit)
+      end if ! t_u
+
+      if ( trim(varnames(iv)) == 'PSFC_U' .or. trim(varnames(iv)) == 'T_U' ) then
+         if ( allocated(var2_inv) ) deallocate( var2_inv )
+         if ( allocated(LamInvET) ) deallocate( LamInvET )
+         if ( allocated(eval) )     deallocate( eval )
+         if ( allocated(evec) )     deallocate( evec )
+         if ( allocated(work) )     deallocate( work )
+         if ( allocated(var2) )     deallocate( var2 )
+      end if
+
+      if ( trim(varnames(iv)) == 'CHI_U' ) then
+         deallocate( covar1 )
+         deallocate( var1 )
+      end if
+      if ( trim(varnames(iv)) == 'PSFC_U' ) then
+         deallocate( covar2 )
+      end if
+      if ( trim(varnames(iv)) == 'T_U' ) then
+         deallocate( covar3 )
+      end if
+
+      write(stdout,'(a,i3,2a)') ' Proc', myproc, ' Computing unbalanced field for variable ', trim(varnames(iv))
+
+      do ic = 1, ncase
+         do ie = istart_member, iend_member
+
+            ! psi is needed for all variables
+            write(ce,'(i3.3)') ie
+            input_file = 'psi.'//filedates(ie,ic)//'.e'//trim(ce)
+            call get_pert1_data(trim(input_file), 'PSI', ni, nj, nk, psi)
+
+            input_file  = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
+            output_file = 'pert1_'//trim(varnames(iv))//'.'//filedates(ie,ic)//'.e'//trim(ce)
+
+            if ( trim(varnames(iv)) == 'CHI_U' ) then
+               call get_pert1_data(trim(input_file), trim(varnames(iv)), ni, nj, nk, chi)
+               do k = 1, nk
+                  do j = 1, nj
+                     do i = 1, ni
+                        b = bin(i,j,k)
+                        chi(i,j,k) = chi(i,j,k) - regcoeff1(b) * psi(i,j,k)
+                     end do
+                  end do
+               end do
+               open (ounit, file=output_file, form='unformatted')
+               write(ounit) 1, ni, nj, nk
+               write(ounit) 3
+               write(ounit) filedates(ie,ic), varnames(iv)
+               write(ounit) chi(:,:,:)
+               close(ounit)
+            end if ! chi_u
+
+            if ( trim(varnames(iv)) == 'PSFC_U' ) then
+               call get_pert1_data(trim(input_file), trim(varnames(iv)), ni, nj, nk, ps)
+               do j = 1, nj
+                  do i = 1, ni
+                     b = bin2d(i,j)
+                     ps(i,j,1) = ps(i,j,1) - SUM(regcoeff2(1:nk,b) * psi(i,j,1:nk))
+                  end do
+               end do
+               open (ounit, file=output_file, form='unformatted')
+               write(ounit) 1, ni, nj, nk
+               write(ounit) 2
+               write(ounit) filedates(ie,ic), varnames(iv)
+               write(ounit) ps(:,:,1)
+               close(ounit)
+            end if ! ps_u
+
+            if ( trim(varnames(iv)) == 'T_U' ) then
+               call get_pert1_data(trim(input_file), trim(varnames(iv)), ni, nj, nk, temp)
+               do j = 1, nj
+                  do i = 1, ni
+                     b = bin2d(i,j)
+                     do k = 1, nk
+                        temp(i,j,k) = temp(i,j,k) - SUM(regcoeff3(k,1:nk,b) * psi(i,j,1:nk))
+                     end do
+                  end do
+               end do
+               open (ounit, file=output_file, form='unformatted')
+               write(ounit) 1, ni, nj, nk
+               write(ounit) 3
+               write(ounit) filedates(ie,ic), varnames(iv)
+               write(ounit) temp(:,:,:)
+               close(ounit)
+            end if ! t_u
+
+         end do ! ens member loop
+      end do ! ncase loop
+
+      if ( trim(varnames(iv)) == 'CHI_U' ) then
+         if ( allocated(chi) ) deallocate( chi )
+      end if
+      if ( trim(varnames(iv)) == 'PSFC_U' ) then
+         if ( allocated(ps) ) deallocate( ps )
+      end if
+      if ( trim(varnames(iv)) == 'T_U' ) then
+         if ( allocated(temp) ) deallocate( temp )
+      end if
+
+   end do var_loop
+
+   deallocate( psi )
+   deallocate( bin_pts )
+   deallocate( bin_pts2d )
+
+end subroutine compute_regcoeff_unbalanced
+
+subroutine compute_bv_sl
+
+   !use da_lapack, only : dsyev
+   implicit none
+
+   character(len=filename_len)  :: filename                   ! Input filename.
+   character(len=filename_len)  :: output_file
+   integer             :: i, j, k, k1, k2, b         ! Loop counters.
+   integer             :: ic, ie, iv, m
+   integer :: istart_member, iend_member
+   real                :: inv_nij                    ! 1 / (ni*nj).
+   real                :: mean_field                 ! Mean field.
+   real                :: coeffa, coeffb             ! Accumulating mean coefficients.
+   logical             :: testing_eofs               ! True if testing EOF decomposition.
+   logical             :: use_global_eofs            ! True if projected data uses global EOFs.
+
+   integer, allocatable:: bin_pts2d(:)               ! Number of points in bin (2D fields).
+   real, allocatable   :: field(:,:,:)               ! Input field.
+   real, allocatable   :: fieldv(:,:,:)              ! projected field
+   real, allocatable   :: field2d(:,:)               ! Input field.
+   real, allocatable   :: bv(:,:,:)                  ! Vertical BE for this time.
+   real, allocatable   :: work(:,:)                  ! EOF work array.
+   real, allocatable   :: e_vec_loc(:,:,:)           ! Latitudinally varying eigenvectors.
+   real, allocatable   :: e_val_loc(:,:)             ! Latitudinally varying eigenvalues.
+   real*8, allocatable :: e_vec(:,:)                 ! Domain-averaged eigenvectors.
+   real*8, allocatable :: e_val(:)                   ! Domain-averaged eigenvalues.
+   real*8, allocatable :: evec(:,:,:)                ! Gridpoint eigenvectors.
+   real*8, allocatable :: eval(:,:)                  ! Gridpoint sqrt(eigenvalues).
+   real :: ETVp
+   character(len=DateStrLen)       :: DateStr_tmp
+   character(len=VarNameLen)       :: var_tmp
+   character(len=filename_len)     :: input_file
+   character(len=3)     :: ce
+
+   integer :: ndim
+   integer :: ijk
+   integer :: proc_send
+   integer :: nn
+   integer :: nvar_read, ni_read, nj_read, nk_read, nkk
+   integer :: k_start, k_end
+   integer, allocatable:: nr(:), icount(:)
+   real, allocatable   :: cov(:,:)
+   real, allocatable   :: ml(:), sl(:)
+   real, allocatable   :: sl_g(:)
+   real, allocatable   :: hl(:), hlt(:)
+
+   integer :: bv_unit = 22
+   integer :: iunit = 23
+   integer :: ounit = 24
+   testing_eofs = .false.
+   use_global_eofs = .true.
+
+   inv_nij = 1.0 / real(ni*nj)
+   allocate( field(1:ni,1:nj,1:nk) )
+
+   if ( trim(be_method) == 'NMC' ) then
+      istart_member = 1
+      iend_member   = 1
+   else if ( trim(be_method) == 'ENS' ) then
+      istart_member = 1
+      iend_member   = nens
+   end if
+
+   if ( pert1_read_opt == 1 ) then
+
+   allocate(xdata(1:nvar,istart_member:iend_member,1:ncase), stat=ierr)
+   call error_handler(ierr, 'allocating xdata in compute_bv_sl')
+
+   do ic = 1, ncase
+      do ie = istart_member, iend_member
+         write(ce,'(i3.3)') ie
+         input_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
+         inquire(file=trim(input_file), exist=isfile)
+         if ( .not. isfile ) then
+            call error_handler(-1, trim(input_file)//' not found for getting perturbation')
+         end if
+         !write(stdout,'(a,i3,a,a)') ' Proc', myproc, ' Reading from ', trim(input_file)
+         open (iunit, file=trim(input_file), form='unformatted', status='old')
+         read(iunit) nvar_read, ni_read, nj_read, nk_read
+         read_loop: do iv = 1, nvar_read
+            read(iunit) ndim
+            read(iunit) DateStr_tmp, var_tmp
+            if ( ndim == 2 ) then
+               read(iunit) field(:,:,1)
+            else if ( ndim == 3 ) then
+               read(iunit) field(:,:,:)
+            end if
+            allocate(xdata(iv,ie,ic)%value(ni,nj,nk), stat=ierr)
+            call error_handler(ierr, 'allocating xdata%value in compute_bv_sl')
+            xdata(iv,ie,ic)%value(:,:,:) = field(:,:,:)
+         end do read_loop
+         close(iunit)
+      end do
+   end do
+
+   end if ! pert1_read_opt=1, store all pert1 data in memory
+
+   if ( do_eof_transform ) then
+
+      if ( myproc == root ) write(stdout,'(4a)')' ====== Computing vertical error eigenvalues, eigenvectors ======'
+
+      allocate( bv(1:nk,1:nk,1:num_bins2d) )
+      allocate( bin_pts2d(1:num_bins2d) )
+      allocate( work(1:nk,1:nk) )
+      allocate( e_vec_loc(1:nk,1:nk,1:num_bins2d) )
+      allocate( e_val_loc(1:nk,1:num_bins2d) )
+      allocate( e_vec(1:nk,1:nk) )
+      allocate( e_val(1:nk) )
       allocate( evec(1:nj,1:nk,1:nk) )
       allocate( eval(1:nj,1:nk) )
 
@@ -1799,7 +2576,7 @@ subroutine compute_bv_sl
 
       var_loop: do iv = 1, nvar
 
-         if ( .not. read_it(iv) ) cycle var_loop
+         if ( .not. do_this_var(iv) ) cycle var_loop
          if ( myproc /= MOD((iv-1), num_procs) ) cycle var_loop
 
          write(stdout,'(a,i3,2a)') ' Proc', myproc, ' Processing vertical error stats for variable ', trim(varnames(iv))
@@ -1825,7 +2602,13 @@ subroutine compute_bv_sl
                else if ( pert1_read_opt == 2 ) then
 
                   write(ce,'(i3.3)') ie
-                  input_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
+                  if ( trim(varnames(iv)) == 'CHI_U' .or. &
+                       trim(varnames(iv)) == 'T_U'   .or. &
+                       trim(varnames(iv)) == 'PSFC_U'    ) then
+                     input_file = 'pert1_'//trim(varnames(iv))//'.'//filedates(ie,ic)//'.e'//trim(ce)
+                  else
+                     input_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
+                  end if
 
                   call get_pert1_data(trim(input_file), trim(varnames(iv)), ni, nj, nk, field)
 
@@ -1951,7 +2734,13 @@ subroutine compute_bv_sl
                   xdata(iv,ie,ic)%value(:,:,:) = field(:,:,:)
                 else if ( pert1_read_opt == 2 ) then
                   write(ce,'(i3.3)') ie
-                  input_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
+                  if ( trim(varnames(iv)) == 'CHI_U' .or. &
+                       trim(varnames(iv)) == 'T_U'   .or. &
+                       trim(varnames(iv)) == 'PSFC_U'    ) then
+                     input_file = 'pert1_'//trim(varnames(iv))//'.'//filedates(ie,ic)//'.e'//trim(ce)
+                  else
+                     input_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
+                  end if
                   call get_pert1_data(trim(input_file), trim(varnames(iv)), ni, nj, nk, field)
                   do m = 1, nk
                      do j = 1, nj
@@ -1979,7 +2768,7 @@ subroutine compute_bv_sl
 #ifdef DM_PARALLEL
       ijk = ni*nj*nk
       do iv = 1, nvar
-         if ( .not. read_it(iv) ) cycle
+         if ( .not. do_this_var(iv) ) cycle
          proc_send = MOD((iv-1), num_procs)
          do ic = 1, ncase
             do ie = istart_member, iend_member
@@ -2029,7 +2818,7 @@ subroutine compute_bv_sl
          allocate(ml(nk))
 
          do iv = 1, nvar
-            if ( .not. read_it(iv) ) cycle
+            if ( .not. do_this_var(iv) ) cycle
             sl(:) = 0.0
             ml(:) = 0.0
             if ( var_dim(iv) == 2 ) then
@@ -2061,7 +2850,13 @@ subroutine compute_bv_sl
                         if ( do_eof_transform .and. (var_dim(iv) == 3) ) then
                            input_file = 'pertv_'//trim(varnames(iv))//'.'//filedates(ie,ic)//'.e'//trim(ce)
                         else
-                           input_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
+                           if ( trim(varnames(iv)) == 'CHI_U' .or. &
+                                trim(varnames(iv)) == 'T_U'   .or. &
+                                trim(varnames(iv)) == 'PSFC_U'    ) then
+                              input_file = 'pert1_'//trim(varnames(iv))//'.'//filedates(ie,ic)//'.e'//trim(ce)
+                           else
+                              input_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
+                           end if ! unbalanced variables
                         end if
                         call get_pert1_data(trim(input_file), trim(varnames(iv)), ni, nj, nk, field)
                         field2d(:,:) = field(:,:,k)
@@ -2095,7 +2890,7 @@ subroutine compute_bv_sl
          allocate( hl(1:num_bins2d) )
          allocate( hlt(1:num_bins2d) )
          do iv = 1, nvar
-            if ( .not. read_it(iv) ) cycle
+            if ( .not. do_this_var(iv) ) cycle
             sl(:) = 0.0
             if ( var_dim(iv) == 2 ) then
                k_start = 1
@@ -2119,7 +2914,13 @@ subroutine compute_bv_sl
                         if ( do_eof_transform .and. (var_dim(iv) == 3) ) then
                            input_file = 'pertv_'//trim(varnames(iv))//'.'//filedates(ie,ic)//'.e'//trim(ce)
                         else
-                           input_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
+                           if ( trim(varnames(iv)) == 'CHI_U' .or. &
+                                trim(varnames(iv)) == 'T_U'   .or. &
+                                trim(varnames(iv)) == 'PSFC_U'    ) then
+                              input_file = 'pert1_'//trim(varnames(iv))//'.'//filedates(ie,ic)//'.e'//trim(ce)
+                           else
+                              input_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
+                           end if ! unbalanced
                         end if
                         call get_pert1_data(trim(input_file), trim(varnames(iv)), ni, nj, nk, field)
                         field2d(:,:) = field(:,:,k)
@@ -2714,6 +3515,8 @@ subroutine get_pert1_data(pert_file, vname, nx, ny, nz, vdata)
    character(len=DateStrLen) :: DateStr_tmp
    character(len=VarNameLen) :: var_tmp
 
+   iunit = 21
+
    !pert_file = 'pert1.'//filedates(ie,ic)//'.e'//trim(ce)
    inquire(file=trim(pert_file), exist=isfile)
    if ( .not. isfile ) then
@@ -3068,3 +3871,1797 @@ subroutine dsyev(JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, INFO )
 end subroutine dsyev
 #endif
 
+subroutine da_uv_to_div_c( dim1, dim2, dim3, ds, &
+                           mapfac_m, mapfac_u, mapfac_v, &
+                           u, v, div )
+
+!------------------------------------------------------------------------------
+!  PURPOSE: Calculate divergence on a co-ordinate surface, given an input
+!           wind field on an Arakawa C-grid.
+!
+!  NOTE: No boundary conditions required on the WRF Arakawa C-grid as
+!        divergence (mass) points are all within the outer u/v pts.
+!
+!
+!  HISTORY: 08/09/2005 - Creation of F90 version.           Dale Barker
+!                        d   U      d   V
+!           Div = m^2 *[---(---) + ---(---) ]
+!                        dx  m      dy  M
+!------------------------------------------------------------------------------
+
+   implicit none
+
+   integer, intent(in):: dim1, dim2, dim3             ! Dimensions
+   real, intent(in)   :: ds                           ! Resolution
+   real, intent(in)   :: mapfac_m(1:dim1,1:dim2)      ! Map factor - mass pts
+   real, intent(in)   :: mapfac_u(1:dim1+1,1:dim2)    ! Map factor - u points
+   real, intent(in)   :: mapfac_v(1:dim1,1:dim2+1)    ! Map factor - u points
+   real, intent(in)   :: u(1:dim1+1,1:dim2,1:dim3)    ! v wind
+   real, intent(in)   :: v(1:dim1,1:dim2+1,1:dim3)    ! v wind
+   real, intent(out)  :: div(1:dim1,1:dim2,1:dim3)    ! Divergence
+
+   integer            :: i, j, k                      ! Loop counters
+   real               :: ds_inv                       ! 1/ds
+   real               :: coeff(1:dim1,1:dim2)         ! Coefficient
+   real               :: um(1:dim1+1,1:dim2,1:dim3)   ! u-wind copy
+   real               :: vm(1:dim1,1:dim2+1,1:dim3)   ! v-wind copy
+
+!------------------------------------------------------------------------------
+!  [1.0] Initialise:
+!------------------------------------------------------------------------------
+
+   div(:,:,:) = 0.0
+
+   ds_inv = 1.0 / ds
+   coeff(:,:) = ds_inv
+
+!------------------------------------------------------------------------------
+!  [2] Calculate scaled u/v field:
+!------------------------------------------------------------------------------
+
+   do k = 1, dim3
+      do j = 1, dim2
+         do i = 1, dim1+1
+            um(i,j,k) = u(i,j,k) / mapfac_u(i,j)
+         end do
+      end do
+   end do
+
+   do k = 1, dim3
+      do j = 1, dim2+1
+         do i = 1, dim1
+            if (mapfac_v(i,j) > 0.000001) then
+               vm(i,j,k) = v(i,j,k) / mapfac_v(i,j)
+            else
+               vm(i,j,k)=0.0
+            end if
+         end do
+      end do
+   end do
+
+!------------------------------------------------------------------------------
+!  [3] Calculate divergence field:
+!------------------------------------------------------------------------------
+
+   do k = 1, dim3
+      do j = 1, dim2
+         do i = 1, dim1
+            div(i,j,k) = coeff(i,j) * ( um(i+1,j,k) - um(i,j,k) + vm(i,j+1,k) - vm(i,j,k) )
+         end do
+      end do
+   end do
+
+end subroutine da_uv_to_div_c
+
+subroutine da_uv_to_vor_c( dim1, dim2, dim3, ds, &
+                           mapfac_m, mapfac_u, mapfac_v, &
+                           u, v, vor )
+
+!------------------------------------------------------------------------------
+!  PURPOSE: Calculate vorticity on a co-ordinate surface, given an input
+!           wind field on an Arakawa C-grid.
+!
+!  NOTE: Zero vorticity boundary conditions.
+!
+!  HISTORY: 08/09/2005 - Creation of F90 version.           Dale Barker
+!                        d   V      d   U
+!           Vor = m^2 *[---(---) - ---(---) ]
+!                        dx  m      dy  M
+!------------------------------------------------------------------------------
+
+   implicit none
+
+   integer, intent(in):: dim1, dim2, dim3             ! Dimensions
+   real, intent(in)   :: ds                           ! Resolution
+   real, intent(in)   :: mapfac_m(1:dim1,1:dim2)      ! Map factor - mass pts
+   real, intent(in)   :: mapfac_u(1:dim1+1,1:dim2)    ! Map factor - u points
+   real, intent(in)   :: mapfac_v(1:dim1,1:dim2+1)    ! Map factor - u points
+   real, intent(in)   :: u(1:dim1+1,1:dim2,1:dim3)    ! v wind
+   real, intent(in)   :: v(1:dim1,1:dim2+1,1:dim3)    ! v wind
+   real, intent(out)  :: vor(1:dim1+1,1:dim2+1,1:dim3)! Vorticity
+
+   integer            :: i, j, k                      ! Loop counters
+   real               :: ds_inv                       ! 1/ds
+   real               :: coeff(1:dim1,1:dim2)         ! Coefficient
+   real               :: um(1:dim1+1,1:dim2,1:dim3)   ! u-wind copy
+   real               :: vm(1:dim1,1:dim2+1,1:dim3)   ! v-wind copy
+
+!------------------------------------------------------------------------------
+!  [1.0] Initialise:
+!------------------------------------------------------------------------------
+
+   vor(:,:,:) = 0.0
+
+   ds_inv = 1.0 / ds
+   coeff(:,:) = ds_inv
+
+!------------------------------------------------------------------------------
+!  [2] Calculate scaled u/v field:
+!------------------------------------------------------------------------------
+
+   do k = 1, dim3
+      do j = 1, dim2
+         do i = 1, dim1+1
+            um(i,j,k) = u(i,j,k) / mapfac_u(i,j)
+         end do
+      end do
+   end do
+
+   do k = 1, dim3
+      do j = 1, dim2+1
+         do i = 1, dim1
+            if (mapfac_v(i,j) > 0.000001) then
+               vm(i,j,k) = v(i,j,k) / mapfac_v(i,j)
+            else
+               vm(i,j,k)=0.0
+            end if
+         end do
+      end do
+   end do
+
+!------------------------------------------------------------------------------
+!  [4] Calculate vorticity field:
+!------------------------------------------------------------------------------
+
+   do k = 1, dim3
+      do j = 2, dim2
+         do i = 2, dim1
+            vor(i,j,k) = coeff(i,j) * ( vm(i,j,k) - vm(i-1,j,k) - um(i,j,k) + um(i,j-1,k) )
+         end do
+      end do
+   end do
+
+!  Boundary values (extrapolation):
+!  Note - not used in Del**2 calculation if overwritten with bcs there).
+!   vor(1,1:dim2+1)      = 2.0 * vor(2,1:dim2+1) - vor(3,1:dim2+1)         ! West
+!   vor(dim1+1,1:dim2+1) = 2.0 * vor(dim1,1:dim2+1) - vor(dim1-1,1:dim2+1) ! East
+!   vor(1:dim1+1,1)      = 2.0 * vor(1:dim1+1,2) - vor(1:dim1+1,3)         ! South
+!   vor(1:dim1+1,dim2+1) = 2.0 * vor(1:dim1+1,dim2) - vor(1:dim1+1,dim2-1) ! North
+
+!  Boundary values (zero gradient):
+!  Note - not used in Del**2 calculation if overwritten with bcs there).
+   vor(1,2:dim2,:)        = vor(2,2:dim2,:)      ! West
+   vor(dim1+1,2:dim2,:)   = vor(dim1,2:dim2,:)   ! East
+   vor(1:dim1+1,1,:)      = vor(1:dim1+1,2,:)    ! South
+   vor(1:dim1+1,dim2+1,:) = vor(1:dim1+1,dim2,:) ! North
+
+end subroutine da_uv_to_vor_c
+
+subroutine da_del2a_to_a( dim1, dim2, dim3, ds, del2a, a )
+! adapted from var/gen_be/gen_be_stage0_wrf.f90
+
+   implicit none
+
+   integer, intent(in):: dim1, dim2, dim3
+   real, intent(in)   :: ds                             ! grid distance
+   real, intent(in)   :: del2a(1:dim1,1:dim2,1:dim3)    ! Del**2 a
+   real, intent(out)  :: a(1:dim1,1:dim2,1:dim3)        ! Field a
+
+   integer, parameter :: num_fft_factors = 10
+   integer, parameter :: nrange = 1000
+   real,    parameter :: pi = 3.1415926
+
+   integer            :: n1, n2                         ! Padded dimensions (n=dim-1+pad)
+   integer            :: fft_method                     ! 1=Cosine, 2=Sine transform
+   integer            :: ifax1(1:num_fft_factors)       ! FFT factors
+   integer            :: ifax2(1:num_fft_factors)       ! FFT factors
+   real, allocatable  :: trigs1(:)                      ! FFT trig functions
+   real, allocatable  :: trigs2(:)                      ! FFT trig functions
+   real, allocatable  :: fft_coeffs(:,:)                ! FFT coefficients
+
+   integer            :: fft_pad1, fft_pad2             ! Range to search for efficient FFT
+   logical            :: found_magic                    ! True if 2**p 3**p 5**r dimension found
+   integer            :: fft_factors(1:num_fft_factors) ! FFT factors
+   real               :: const                          ! Multiplicative constant
+   real               :: coeff_nx                       ! Multiplicative constant
+   real               :: coeff_ny                       ! Multiplicative constant
+   real               :: cos_coeff_nx                   ! Multiplicative constant
+   real               :: cos_coeff_ny                   ! Multiplicative constant
+
+   integer            :: i, j, k                        ! Loop counters
+   integer            :: ij                             ! 1D array counter
+   integer            :: isign                          ! -1=Grid>spec, 1=Spec>Grid
+   integer            :: inc                            ! Stride between data points
+   integer            :: jump                           ! Increment between start of data vectors
+   integer            :: lot                            ! Number of data vectors
+   integer            :: n                              ! n+1 is the length of the data
+   integer            :: work_area                      ! Dimension of workspace
+   !real               :: a2d(1:n1+1,1:n2+1)             ! 2D data array
+   !real               :: a1d(1:(n1+1)*(n2+1))           ! 1D data array
+   real, allocatable  :: a2d(:,:)                       ! 2D data array
+   real, allocatable  :: a1d(:)                         ! 1D data array
+
+!  Ensure efficient FFT dimensions by padding if necessary:
+   n1 = dim1 - 1
+   do n = n1, n1 + nrange
+      call da_find_fft_factors( n, found_magic, fft_factors )
+      if ( found_magic .and. mod(n,2) == 0 ) then ! Even magic number found.
+         fft_pad1 = n - n1
+         ifax1 = fft_factors
+         exit
+      end if
+   end do
+   n1 = n1 + fft_pad1
+
+   n2 = dim2 - 1
+   do n = n2, n2 + nrange
+      call da_find_fft_factors( n, found_magic, fft_factors )
+      if ( found_magic .and. mod(n,2) == 0 ) then ! Even magic number found.
+         fft_pad2 = n - n2
+         ifax2 = fft_factors
+         exit
+      end if
+   end do
+   n2 = n2 + fft_pad2
+
+   allocate( trigs1(1:3*n1) )
+   allocate( trigs2(1:3*n2) )
+   allocate( fft_coeffs(1:n1+1,1:n2+1) )
+
+   const = -0.5 * ds * ds
+   coeff_nx = pi / real(n1)
+   coeff_ny = pi / real(n2)
+
+!  Calculate spectral Del**2 coefficients for C-grid (all pts. except i=j=1):
+   fft_coeffs(1,1) = 0.0 ! Not used?
+   do j = 2, n2+1
+      cos_coeff_ny = cos(coeff_ny * real(j - 1))
+      do i = 1, n1+1
+         cos_coeff_nx = cos(coeff_nx * real(i - 1))
+         fft_coeffs(i,j) = const / ( 2.0 - cos_coeff_nx - cos_coeff_ny)
+      end do
+   end do
+   j = 1
+   cos_coeff_ny = cos(coeff_ny * real(j - 1))
+   do i = 2, n1+1
+      cos_coeff_nx = cos(coeff_nx * real(i - 1))
+      fft_coeffs(i,j) = const / ( 2.0 - cos_coeff_nx - cos_coeff_ny)
+   end do
+
+   call da_find_fft_trig_funcs( n1, trigs1 )
+   call da_find_fft_trig_funcs( n2, trigs2 )
+
+   fft_method = 2
+
+   work_area = ( n1 + 1 ) * ( n2 + 1 )
+
+   allocate ( a2d(1:n1+1,1:n2+1) )
+   allocate ( a1d(1:(n1+1)*(n2+1)) )
+
+do k = 1, dim3
+
+!  Fill 2D array structure
+   do j = 1, dim2
+      do i = 1, dim1
+         a2d(i,j) = del2a(i,j,k)
+      end do
+
+!     Fill pad zone (and force b.c.s to satisfy solution type):
+      if ( fft_method == 1 ) then ! Cosine transform.
+         a2d(1,j) = a2d(2,j)
+         do i = dim1, n1+1
+            a2d(i,j) = a2d(dim1-1,j)
+         end do
+      else if ( fft_method == 2 ) then ! Sine transform:
+         a2d(1,j) = 0.0
+         do i = dim1, n1+1
+            a2d(i,j) = 0.0
+         end do
+      end if
+   end do
+
+   if ( fft_method == 1 ) then ! Cosine transform.
+      do i = 1, n1+1
+         a2d(i,1) = a2d(i,2)
+         do j = dim2, n2+1
+            a2d(i,j) = a2d(i,dim2-1)
+         end do
+      end do
+   else if ( fft_method == 2 ) then ! Sine transform:
+      do i = 1, n1+1
+         a2d(i,1) = 0.0
+         do j = dim2, n2+1
+            a2d(i,j) = 0.0
+         end do
+      end do
+   end if
+
+!  Transfer to data array:
+   do j = 1, n2+1
+      do i = 1, n1+1
+         ij = (j-1) * (n1+1) + i
+         a1d(ij) = a2d(i,j)
+      end do
+   end do
+
+!------------------------------------------------------------------------------
+!     Perform double fast sine/cosine transform to get spectral del2a:
+!------------------------------------------------------------------------------
+
+   isign = -1 ! Grid to spectral
+
+!  1st dimension:
+   inc = 1    ! Stride between data points.
+   jump = n1+1! Increment between start of data vectors.
+   lot = n2+1 ! Number of data vectors.
+   n = n1     ! n+1 is the length of the data.
+   if ( fft_method == 1 ) then
+      call fft551( isign, inc, jump, lot, n, &
+                   ifax1, trigs1, a1d, work_area )
+   else if ( fft_method == 2 ) then
+      call fft661( isign, inc, jump, lot, n, &
+                   ifax1, trigs1, a1d, work_area )
+   end if
+
+!  2nd dimension:
+   inc = n1+1 ! Stride between data points.
+   jump = 1   ! Increment between start of data vectors.
+   lot = n1+1 ! Number of data vectors.
+   n = n2     ! n+1 is the length of the data.
+
+   if ( fft_method == 1 ) then
+      call fft551( isign, inc, jump, lot, n, &
+                   ifax2, trigs2, a1d, work_area )
+   else if ( fft_method == 2 ) then
+      call fft661( isign, inc, jump, lot, n, &
+                   ifax2, trigs2, a1d, work_area )
+   end if
+
+!------------------------------------------------------------------------------
+!  Perform conversion from del2a to a in spectral space:
+!------------------------------------------------------------------------------
+
+!  Note fft_coeffs(1,1)=0 so a(k=0,l=0) is also 0.
+   do j = 1, n2+1
+      do i = 1, n1+1
+         ij = (j-1) * (n1+1) + i
+         a1d(ij) = fft_coeffs(i,j) * a1d(ij)
+      end do
+   end do
+
+!------------------------------------------------------------------------------
+!  Perform double fast sine/cosine transform to get gridpoint a:
+!------------------------------------------------------------------------------
+
+   isign = 1 ! Spectral to grid.
+
+!  1st dimension:
+   inc = 1    ! Stride between data points.
+   jump = n1+1! Increment between start of data vectors.
+   lot = n2+1 ! Number of data vectors.
+   n = n1     ! n+1 is the length of the data.
+
+   if ( fft_method == 1 ) then
+      call fft551( isign, inc, jump, lot, n, &
+                   ifax1, trigs1, a1d, work_area )
+   else if ( fft_method == 2 ) then
+      call fft661( isign, inc, jump, lot, n, &
+                   ifax1, trigs1, a1d, work_area )
+   end if
+
+!  2nd dimension:
+   inc = n1+1 ! Stride between data points.
+   jump = 1   ! Increment between start of data vectors.
+   lot = n1+1 ! Number of data vectors.
+   n = n2     ! n+1 is the length of the data.
+
+   if ( fft_method == 1 ) then
+      call fft551( isign, inc, jump, lot, n, &
+                   ifax2, trigs2, a1d, work_area )
+   else if ( fft_method == 2 ) then
+      call fft661( isign, inc, jump, lot, n, &
+                   ifax2, trigs2, a1d, work_area )
+   end if
+
+!  Transfer grid-point chi to 2D-array (throwing away pad):
+   do j = 1, dim2
+      do i = 1, dim1
+         ij = (j-1) * (n1+1) + i
+         a(i,j,k) = a1d(ij)
+      end do
+   end do
+
+end do ! k loop
+
+   deallocate (a2d)
+   deallocate (a1d)
+   deallocate (fft_coeffs)
+   deallocate (trigs2)
+   deallocate (trigs1)
+
+contains
+
+subroutine da_find_fft_factors(n, n_ok, fft_factors)
+! taken from var/da/da_tools/da_find_fft_factors.inc
+
+   !---------------------------------------------------------------------------
+   ! Purpose: Calculates prime factors of input number.
+   !---------------------------------------------------------------------------
+
+   implicit none
+
+   integer, intent(in)  :: n
+   logical, intent(out) :: n_ok
+   integer, intent(out) :: fft_factors(:)
+
+   integer :: i, k, l
+   integer :: nfax, nu, ifac
+   integer :: jfax(num_fft_factors)
+   integer :: lfax(7)
+
+   data lfax /6,8,5,4,3,2,1/
+
+   ! in da_control
+   !if (trace_use) call da_trace_entry("da_find_fft_factors")
+
+   !---------------------------------------------------------------------------
+   ! [1.0] Find factors of vector size (8,6,5,4,3,2; only one 8 allowed):
+   !---------------------------------------------------------------------------
+
+   n_ok = .false.
+   fft_factors(:) = 0
+
+   ! look for sixes first, store factors in descending order
+   nu=n
+   ifac=6
+   k=0
+   l=1
+
+20 continue
+
+   if (mod(nu,ifac).ne.0) goto 30
+
+   ! 6 is a factor:
+   k=k+1
+   jfax(k)=ifac
+   if (ifac.ne.8) goto 25
+   if (k.eq.1) goto 25
+   jfax(1)=8
+   jfax(k)=6
+
+25 continue
+   nu=nu/ifac
+   if (nu.eq.1) goto 50
+   if (ifac.ne.8) goto 20
+
+30 continue
+   l=l+1
+   ifac=lfax(l)
+   if (ifac .gt. 1) goto 20
+
+   ! illegal factors:
+   ! write (unit=message(1),fmt='(a,i4,a)') 'n = ', n, ' contains illegal factors.'
+   ! call da_warning(__file__,__line__,message(1:1))
+
+   goto 9
+
+   ! now reverse order of factors
+50 continue
+   nfax=k
+   fft_factors(1)=nfax
+   do i=1,nfax
+      fft_factors(nfax+2-i)=jfax(i)
+   end do
+
+   n_ok = .true.
+
+9  continue
+
+   !if (trace_use) call da_trace_exit("da_find_fft_factors")
+
+end subroutine da_find_fft_factors
+
+subroutine da_find_fft_trig_funcs(n, trig_functs)
+! taken from var/da/da_tools/da_find_fft_trig_funcs.inc
+
+   !---------------------------------------------------------------------------
+   ! Purpose: Set up constants required for Fourier, sine and cosine transforms
+   !---------------------------------------------------------------------------
+
+   implicit none
+
+   integer, intent(in)  :: n
+   real,    intent(out) :: trig_functs(:)
+
+   integer :: k, nil, nhl
+   real    :: del, angle
+
+   ! in da_control
+   ! if (trace_use) call da_trace_entry("da_find_fft_trig_funcs")
+
+   !---------------------------------------------------------------------------
+   ! [1.0] Trig functions for real periodic transform:
+   !---------------------------------------------------------------------------
+
+   trig_functs(:) = 0.0
+
+   del=4.0*(pi/2.0)/float(n)
+   nil=0
+   nhl=(n/2)-1
+
+   do k=nil,nhl
+      angle=float(k)*del
+      trig_functs(2*k+1)=cos(angle)
+      trig_functs(2*k+2)=sin(angle)
+   end do
+
+   ! [1.1] extra trig functions for cosine transform:
+
+   del=0.5*del
+   do k=1,nhl
+      angle=float(k)*del
+      trig_functs(2*n+k)=sin(angle)
+   end do
+
+   ! [1.2] extra trig functions for shifted cosine transform:
+
+   del=0.5*del
+   do k=1,n
+      angle=float(k)*del
+      trig_functs(n+k)=sin(angle)
+   end do
+
+   !if (trace_use) call da_trace_exit("da_find_fft_trig_funcs")
+
+end subroutine da_find_fft_trig_funcs
+
+end subroutine da_del2a_to_a
+
+!DECK FFT551
+!     SUBROUTINE 'FFT551' - MULTIPLE FAST COSINE TRANSFORM
+!
+!     AUTHOR: CLIVE TEMPERTON, MAY 1988
+!     [ALL-FORTRAN VERSION: C.T., OCTOBER 1995]
+!
+!     COSINE TRANSFORM OF LENGTH N IS CONVERTED TO
+!     REAL PERIODIC TRANSFORM OF LENGTH N BY PRE- AND POST-
+!     PROCESSING. REAL PERIODIC TRANSFORM IS PERFORMED BY
+!     PRUNING REDUNDANT OPERATIONS FROM COMPLEX TRANSFORM.
+!
+!     SEE FOR EXAMPLE PAUL SWARZTRAUBER, "SYMMETRIC FFT'S",
+!     MATH. COMP. 47 (1986), 323-346.
+!
+!     A IS THE ARRAY CONTAINING INPUT & OUTPUT DATA
+!     WORK IS AN AREA OF SIZE (N+1)*MIN(LOT,64)
+!     TRIGS IS A PREVIOUSLY PREPARED LIST OF TRIG FUNCTION VALUES
+!     IFAX IS A PREVIOUSLY PREPARED LIST OF FACTORS OF N
+!     INC IS THE INCREMENT WITHIN EACH DATA 'VECTOR'
+!         (E.G. INC=1 FOR CONSECUTIVELY STORED DATA)
+!     JUMP IS THE INCREMENT BETWEEN THE START OF EACH DATA VECTOR
+!     N+1 IS THE LENGTH OF THE DATA VECTORS
+!        (WHICH INCLUDE NONZERO VALUES AT BOTH ENDPOINTS)
+!     LOT IS THE NUMBER OF DATA VECTORS
+!     ISIGN = +1 FOR TRANSFORM FROM SPECTRAL TO GRIDPOINT
+!           = -1 FOR TRANSFORM FROM GRIDPOINT TO SPECTRAL
+!
+!     ORDERING OF COEFFICIENTS:   Z(0) , Z(1) , Z(2) , ... , Z(N)
+!
+!     ORDERING OF DATA:           X(0) , X(1) , X(2) , ... , X(N)
+!
+!     VECTORIZATION IS ACHIEVED ON CRAY BY DOING THE TRANSFORMS
+!     IN PARALLEL
+!
+!     N MUST BE COMPOSED OF FACTORS 2,3 & 5 AND MUST BE EVEN
+!
+!     DEFINITION OF TRANSFORMS:
+!     -------------------------
+!
+!     ISIGN=+1: X(I)=SUM(J=0,...,N)(E(J)*Z(J)*COS(I*J*PI/N))
+!                    WHERE E(J)=0.5 FOR J=0,N --- ELSE E(J)=1
+!
+!     ISIGN=-1: Z(J)=(2/N)*SUM(I=0,...,N)(E(I)*X(I)*COS(I*J*PI/N))
+!
+! N.B.  FFT551 has an unusual definition of the FFTs,
+!       such that the the coeff of wave0 is NOT the mean.
+!
+!---------------------------------------------------------------------
+Subroutine FFT551 & ! in
+ ( ISIGN,                & ! in
+   INC,                  & ! in
+   JUMP,                 & ! in
+   LOT,                  & ! in
+   N,                    & ! in
+   IFAX,                 & ! in
+   TRIGS,                & ! in
+   A,                    & ! inout
+   IDIM )                   ! in
+
+! Code Description:  ORIGINAL CODE F77 IS HARDLY TOUCHED !!!
+
+ Integer , intent (in)    :: ISIGN         ! Switch forward (-1) or inverse (+1)
+ Integer , intent (in)    :: INC           ! increment within each data
+                                           ! vector  (e.g. INC=1 for 
+                                           ! consecutively stored data)
+ Integer , intent (in)    :: Jump          ! increment between start of
+                                           ! data vectors
+ Integer , intent (in)    :: LOT           ! Number of data vectors
+ Integer , intent (in)    :: N             ! N+1 is the length of the data 
+
+ Integer , intent (in)    :: IFAX(10)      ! previously prepared list of 
+                                           ! factors of N
+ 
+ Real    , intent (in)    :: TRIGS(3*N)    ! previously prepared list of 
+                                           ! trigonometric function values
+ Real    , intent (inout) :: A( INC*(N+1) + JUMP*(LOT-1) ) ! data array                                       !  vectors  (which include zeros 
+                                           ! at the endpoints)
+ Integer , intent (in)    :: IDIM           ! dimension workspace 
+
+ Real :: WORK(IDIM)                      ! size (n+1)*min(lot,VectorLength)
+ Integer                  :: NFAX,NX,NH
+ Integer                  :: NBLOX,NVEX,NB
+ Integer                  :: K, IC, J, LA, IGO, JA,JB,IA,IB
+ Integer                  :: IFAC,IERR,ISTART
+
+ Real                     :: CO,S, t1,t2,si,scale, vectorlength
+
+CHARACTER (LEN=*), PARAMETER :: RoutineName = "Var_FFT551"
+
+      VectorLength = LOT
+      NFAX=IFAX(1)
+      NX=N+1
+      NH=N/2
+      NBLOX=1+(LOT-1)/VectorLength
+      NVEX=LOT-(NBLOX-1)*VectorLength
+      ISTART=1
+!
+      DO 200 NB=1,NBLOX
+!
+!     PREPROCESSING
+!     -------------
+      IA=ISTART
+      IB=IA+NH*INC
+      IC=IA+N*INC
+      JA=1
+      JB=NH+1
+      IF (MOD(NFAX,2).EQ.1) THEN
+!DIR$ IVDEP
+         DO 105 J=1,NVEX
+         T1=0.5*(A(IA)+A(IC))
+         T2=0.5*(A(IA)-A(IC))
+         A(IA)=T1
+         A(IC)=T2
+         IA=IA+JUMP
+         IC=IC+JUMP
+  105    CONTINUE
+      ELSE  
+!DIR$ IVDEP
+         DO 110 J=1,NVEX
+         WORK(JA)=0.5*(A(IA)+A(IC))
+         WORK(JB)=A(IB)
+         A(IC)=0.5*(A(IA)-A(IC))
+         IA=IA+JUMP
+         IB=IB+JUMP
+         IC=IC+JUMP
+         JA=JA+NX
+         JB=JB+NX
+  110    CONTINUE
+      ENDIF
+!
+      DO 130 K=1,NH-1
+      JA=K+1
+      JB=N+1-K
+      IA=ISTART+K*INC
+      IB=ISTART+(JB-1)*INC
+      IC=ISTART+N*INC
+      SI=TRIGS(2*N+K)
+      CO=TRIGS(2*N+NH-K)
+      IF (MOD(NFAX,2).EQ.1) THEN
+!DIR$ IVDEP
+         DO 115 J=1,NVEX
+         T1 = 0.5*(A(IA)+A(IB)) - SI*(A(IA)-A(IB))
+         T2 = 0.5*(A(IA)+A(IB)) + SI*(A(IA)-A(IB))
+         A(IC) = A(IC) + CO*(A(IA)-A(IB))
+         A(IA) = T1
+         A(IB) = T2
+         IA=IA+JUMP
+         IB=IB+JUMP
+         IC=IC+JUMP
+  115    CONTINUE
+      ELSE
+!DIR$ IVDEP
+         DO 120 J=1,NVEX
+         WORK(JA) = 0.5*(A(IA)+A(IB)) - SI*(A(IA)-A(IB))
+         WORK(JB) = 0.5*(A(IA)+A(IB)) + SI*(A(IA)-A(IB))
+         A(IC) = A(IC) + CO*(A(IA)-A(IB))
+         IA=IA+JUMP
+         IB=IB+JUMP
+         IC=IC+JUMP
+         JA=JA+NX
+         JB=JB+NX
+  120    CONTINUE
+      ENDIF
+  130 CONTINUE
+!
+!     PERIODIC FOURIER ANALYSIS
+!     -------------------------
+      IA=ISTART
+      LA=N
+      IGO=1-2*MOD(NFAX,2)
+!
+      DO 140 K=1,NFAX
+      IFAC=IFAX(NFAX+2-K)
+      LA=LA/IFAC
+      IERR=-1
+      IF (IGO.EQ.+1) THEN
+        CALL qpassm(WORK,WORK(IFAC*LA+1),A(IA),A(IA+LA*INC), &
+                    TRIGS,1,INC,NX,JUMP,NVEX,N,IFAC,LA,IERR)
+      ELSE IF (IGO.EQ.-1) THEN
+        CALL qpassm(A(IA),A(IA+IFAC*LA*INC),WORK,WORK(LA+1), &
+                    TRIGS,INC,1,JUMP,NX,NVEX,N,IFAC,LA,IERR)
+      ENDIF
+      IF (IERR.NE.0) GO TO 500
+      IGO=-IGO
+  140 CONTINUE
+!
+!     POSTPROCESSING
+!     --------------
+      SCALE=2.0
+      IF (ISIGN.EQ.+1) SCALE = FLOAT(N)
+      S=1.0
+      IF (ISIGN.EQ.-1) S = 2.0/FLOAT(N)
+      JA=ISTART
+      JB=JA+N*INC
+      IA=1
+      IB=N
+!DIR$ IVDEP
+      DO 150 J=1,NVEX
+      A(JA)=SCALE*WORK(IA)
+      A(JA+INC)=S*A(JB)
+      A(JB)=SCALE*WORK(IB)
+      IA=IA+NX
+      IB=IB+NX
+      JA=JA+JUMP
+      JB=JB+JUMP
+  150 CONTINUE
+!
+      DO 170 K=2,N-2,2
+      JA=ISTART+K*INC
+      IA=K
+!DIR$ IVDEP
+      DO 160 J=1,NVEX
+      A(JA)=SCALE*WORK(IA)
+      A(JA+INC)=-SCALE*WORK(IA+1)+A(JA-INC)
+      IA=IA+NX
+      JA=JA+JUMP
+  160 CONTINUE
+  170 CONTINUE
+!
+      ISTART=ISTART+NVEX*JUMP
+      NVEX=VectorLength
+  200 CONTINUE
+      GO TO 570
+!
+!     ERROR MESSAGES
+!     --------------
+  500 CONTINUE
+      GO TO (510,530,550) IERR
+  510 CONTINUE
+      WRITE(UNIT=0,FMT='(A,I4,A)') &
+        'VECTOR LENGTH =',NVEX,', GREATER THAN VectorLength'
+      GO TO 570
+  530 CONTINUE
+      WRITE(UNIT=0,FMT='(A,I3,A)') &
+        'FACTOR =',IFAC,', NOT CATERED FOR'
+      GO TO 570
+  550 CONTINUE
+      WRITE(UNIT=0,FMT='(A,I3,A)') &
+        'FACTOR =',IFAC,', ONLY CATERED FOR IF LA*IFAC=N'
+  570 CONTINUE
+
+      RETURN
+      END SUBROUTINE FFT551
+
+
+Subroutine FFT661  & ! in
+ ( ISIGN,              & ! in
+   INC,                & ! in
+   JUMP,               & ! in
+   LOT,                & ! in
+   N,                  & ! in
+   IFAX,               & ! in
+   TRIGS,              & ! in
+   A,                  & ! inout
+   DIM )                 ! in
+!
+!
+! Description:
+!     MULTIPLE FAST SINE TRANSFORM
+!     (Originally called FFT661, then Var_SinTrans)
+!      author: clive temperton, may 1988 
+!       (slightly modified for all-fortran version)
+!
+!     Sine transform of length n is converted to
+!     Real periodic transform of length n by pre- and post-
+!     processing. Real periodic transform is performed by
+!     pruning redundant operations from complex transform.
+!
+!     see for example paul swarztrauber, "symmetric fft's",
+!     math. comp. 47 (1986), 323-346.
+!
+! Method:
+!
+!     ordering of coefficients:   z(0) , z(1) , z(2) , ... , z(n)
+!     ordering of data:           x(0) , x(1) , x(2) , ... , x(n)
+!
+!    vectorization is achieved on cray by doing the transforms
+!    in parallel
+!
+!    N must be composed of factors 2,3 & 5 and must be even
+!
+!    definition of transforms:
+!     -------------------------
+!
+!     isign=+1: x(i)=sum(j=1,...,n-1)(z(j)*sin(i*j*pi/n))
+!
+!     isign=-1: z(j)=(2/n)*sum(i=1,...,n-1)(x(i)*sin(i*j*pi/n))
+!
+! Current Code Owner: Andrew Lorenc
+!
+!   History:
+! Version   Date     Comment
+! -------   ----     -------
+! 0.1       14/12/93 Original code. Phil Andrews
+! 0.2       16/09/94 Small Modifications for the
+!                    incorporation in the VAR project. HB
+! 1.1       21/04/95 placed under control. JB
+! 1.2       01/06/95 Tracing added. JB
+!
+! Code Description:
+!    NB   BECAUSE OF THE TRICKY NESTED LOOPS 
+!         ORIGINAL CODE F77 IS HARDLY TOUCHED !!!
+
+Implicit none
+
+! Subroutine arguments
+ Integer , intent (in)    :: ISIGN         ! Switch forward (-1) or inverse (+1)
+ Integer , intent (in)    :: INC           ! increment within each data
+                                           ! vector  (e.g. INC=1 for 
+                                           ! consecutively stored data)
+ Integer , intent (in)    :: Jump          ! increment between start of
+                                           ! data vectors
+ Integer , intent (in)    :: LOT           ! Number of data vectors
+ Integer , intent (in)    :: N             ! N+1 is the length of the data 
+                                           !  vectors  (which include zeros 
+                                           ! at the endpoints)
+ Integer , intent (in)    :: DIM           ! dimension workspace 
+ Integer , intent (in)    :: IFAX(10)      ! previously prepared list of 
+                                           ! factors of N
+ 
+ Real    , intent (in)    :: TRIGS(3*N)    ! previously prepared list of 
+                                           ! trigonometric function values
+ Real    , intent (inout) :: A( INC*(N+1) + JUMP*(LOT-1) ) ! data array
+
+                                                    ! No descriptions given
+ Integer                  :: NFAX,NX,NH
+ Integer                  :: NBLOX,NVEX,NB
+ Integer                  :: K,JA,JB,IA,IB,IGO,LA,J
+ Integer                  :: IFAC,IERR,ISTART
+
+ Real                     :: SI,T1,T2,SCALE, vectorlength
+ Real                     :: WORK(DIM)     ! size (n+1)*min(lot,VectorLength)
+     
+      VectorLength = LOT
+      NFAX=IFAX(1)
+      NX=N+1
+      NH=N/2
+      NBLOX=1+(LOT-1)/VectorLength
+      NVEX=LOT-(NBLOX-1)*VectorLength
+      ISTART=1
+!
+      DO 200 NB=1,NBLOX
+!
+!     PREPROCESSING
+!     -------------
+      DO 120 K=1,NH-1
+      JA=K+1
+      JB=N+1-K
+      IA=ISTART+K*INC
+      IB=ISTART+(JB-1)*INC
+      SI=TRIGS(2*N+K)
+      IF (MOD(NFAX,2).EQ.0) THEN
+!DIR$ IVDEP
+         DO 110 J=1,NVEX
+         WORK(JA) = SI*(A(IA)+A(IB)) + 0.5*(A(IA)-A(IB))
+         WORK(JB) = SI*(A(IA)+A(IB)) - 0.5*(A(IA)-A(IB))
+         IA=IA+JUMP
+         IB=IB+JUMP
+         JA=JA+NX
+         JB=JB+NX
+  110    CONTINUE
+      ELSE
+!DIR$ IVDEP
+         DO 115 J=1,NVEX
+         T1 = SI*(A(IA)+A(IB)) + 0.5*(A(IA)-A(IB))
+         T2 = SI*(A(IA)+A(IB)) - 0.5*(A(IA)-A(IB))
+         A(IA) = T1
+         A(IB) = T2
+         IA=IA+JUMP
+         IB=IB+JUMP
+  115    CONTINUE
+      ENDIF
+  120 CONTINUE
+
+      JA=1
+      JB=NH+1
+      IA=ISTART
+      IB=ISTART+NH*INC
+      IF (MOD(NFAX,2).EQ.0) THEN
+!DIR$ IVDEP
+         DO 130 J=1,NVEX
+         WORK(JA)=0.0
+         WORK(JB)=2.0*A(IB)
+         IB=IB+JUMP
+         JA=JA+NX
+         JB=JB+NX
+  130    CONTINUE
+         IGO = +1
+      ELSE
+!DIR$ IVDEP
+         DO 135 J=1,NVEX
+         A(IA)=0.0
+         A(IB)=2.0*A(IB)
+         IA=IA+JUMP
+         IB=IB+JUMP
+  135    CONTINUE
+         IGO = -1
+      ENDIF
+!
+!     PERIODIC FOURIER ANALYSIS
+!     -------------------------
+      IA=ISTART
+      LA=N
+!
+      DO 140 K=1,NFAX
+      IFAC=IFAX(NFAX+2-K)
+      LA=LA/IFAC
+      IERR=-1
+      IF (IGO.EQ.+1) THEN
+        CALL qpassm(WORK,WORK(IFAC*LA+1),A(IA),A(LA*INC+IA), &
+                    TRIGS,1,INC,NX,JUMP,NVEX,N,IFAC,LA,IERR)
+      ELSE IF (IGO.EQ.-1) THEN
+        CALL qpassm(A(IA),A(IFAC*LA*INC+IA),WORK,WORK(LA+1), &
+                    TRIGS,INC,1,JUMP,NX,NVEX,N,IFAC,LA,IERR)
+      ENDIF
+      IF (IERR.NE.0) GO TO 500
+      IGO=-IGO
+  140 CONTINUE
+!
+!     POSTPROCESSING
+!     --------------
+      SCALE=2.0
+      IF (ISIGN.EQ.+1) SCALE = FLOAT(N)
+      JA=ISTART
+      JB=JA+N*INC
+      IA=1
+!DIR$ IVDEP
+      DO 150 J=1,NVEX
+      A(JA)=0.0
+      A(JA+INC)=0.5*SCALE*WORK(IA)
+      A(JB)=0.0
+      IA=IA+NX
+      JA=JA+JUMP
+      JB=JB+JUMP
+  150 CONTINUE
+!
+      DO 170 K=2,N-2,2
+      JA=ISTART+K*INC
+      IA=K
+!DIR$ IVDEP
+      DO 160 J=1,NVEX
+      A(JA)=-SCALE*WORK(IA+1)
+      A(JA+INC)=SCALE*WORK(IA)+A(JA-INC)
+      IA=IA+NX
+      JA=JA+JUMP
+  160 CONTINUE
+  170 CONTINUE
+!
+      ISTART=ISTART+NVEX*JUMP
+      NVEX=VectorLength
+  200 CONTINUE
+
+      Go To 570
+!
+!     ERROR MESSAGES
+!     --------------
+  500 CONTINUE
+      GO TO (510,530,550) IERR
+  510 CONTINUE
+      WRITE(UNIT=0,FMT='(A,I5,A)') 'NVEX=', NVEX ,' GREATER THAN VectorLength'
+      GO TO 570
+  530 CONTINUE
+      WRITE(UNIT=0,FMT='(A,I5,A)') 'IFAC=', IFAC, 'NOT CATERED FOR'
+      GO TO 570
+  550 CONTINUE
+      WRITE(UNIT=0,FMT='(A,I5,A)') 'IFAC=', IFAC, ' ONLY CATERED FOR IF LA*IFAC=N'
+  570 CONTINUE
+
+
+      RETURN
+
+
+End subroutine FFT661
+
+
+!C     SUBROUTINE 'QPASSM' - PERFORMS ONE PASS THROUGH DATA AS PART!C     OF MULTIPLE REAL FFT (FOURIER ANALYSIS) ROUTINE
+!C
+!C     A IS FIRST REAL INPUT VECTOR
+!C         EQUIVALENCE B(1) WITH A(IFAC*LA*INC1+1)
+!C     C IS FIRST REAL OUTPUT VECTOR
+!C         EQUIVALENCE D(1) WITH C(LA*INC2+1)
+!C     TRIGS IS A PRECALCULATED LIST OF SINES & COSINES
+!C     INC1 IS THE ADDRESSING INCREMENT FOR A
+!C     INC2 IS THE ADDRESSING INCREMENT FOR C
+!C     INC3 IS THE INCREMENT BETWEEN INPUT VECTORS A
+!C     INC4 IS THE INCREMENT BETWEEN OUTPUT VECTORS C
+!C     LOT IS THE NUMBER OF VECTORS
+!C     N IS THE LENGTH OF THE VECTORS
+!C     IFAC IS THE CURRENT FACTOR OF N
+!C     LA = N/(PRODUCT OF FACTORS USED SO FAR)
+!C     IERR IS AN ERROR INDICATOR:
+!C              0 - PASS COMPLETED WITHOUT ERROR
+!C              1 - LOT GREATER THAN VectorLength
+!C              2 - IFAC NOT CATERED FOR
+!C              3 - IFAC ONLY CATERED FOR IF LA=N/IFAC
+!C
+!C-----------------------------------------------------------------------
+!C
+     SUBROUTINE QPASSM(A,B,C,D,TRIGS,INC1,INC2,INC3,INC4,LOT,N,IFAC,LA,IERR)
+
+      INTEGER :: inc1, inc2, inc3, inc4, lot, n, ifac, la, ierr
+
+      REAL  :: a, b, c, d, trigs
+      DIMENSION A(*),B(*),C(*),D(*),TRIGS(N)
+! Local named constants
+ CHARACTER (LEN=*), PARAMETER :: RoutineName = "QPASSM"
+!
+      REAL  :: SIN36, SIN72, QRT5, SIN60
+      DATA SIN36/0.587785252292473/,SIN72/0.951056516295154/, &
+          QRT5/0.559016994374947/,SIN60/0.866025403784437/
+
+      REAL    :: s1, s2, s3, s4, s5
+      REAL    :: sin45, zsin36, zsin72, zqrt5, zsin60, zsin45, z
+      REAL    :: a0, a1, a2, a3, a4, a5, a6, a10, a11, a20, a21
+      REAL    :: b0, b1, b2, b3, b4, b5, b6, b10, b11, b20, b21
+!      REAL    :: c0, c1, c2, c3, c4, c5
+      REAL    :: c1, c2, c3, c4, c5
+      INTEGER :: i, ijk, l, k, kb, m, iink, jink, ijump, kstop
+      INTEGER :: ibad, igo, ia, ie, je, ibase, jbase, ja, jb, j, ic
+      INTEGER :: if, jf, kf, ib, jc, kc, id, jd, kd, ke, ig, ih
+      INTEGER :: vectorlength
+!
+!- End of header ---------------------------------------------------------------
+
+
+      M=N/IFAC
+      IINK=LA*INC1
+      JINK=LA*INC2
+      IJUMP=(IFAC-1)*IINK
+      KSTOP=(N-IFAC)/(2*IFAC)
+!
+      IBAD=1
+      VectorLength = lot
+      IF (LOT.GT.VectorLength) GO TO 910
+      IBASE=0
+      JBASE=0
+      IGO=IFAC-1
+      IF (IGO.EQ.7) IGO=6
+      IBAD=2
+      IF (IGO.LT.1.OR.IGO.GT.6) GO TO 910
+      GO TO (200,300,400,500,600,800),IGO
+!
+!     CODING FOR FACTOR 2
+!     -------------------
+  200 CONTINUE
+      IA=1
+      IB=IA+IINK
+      JA=1
+      JB=JA+(2*M-LA)*INC2
+!
+      IF (LA.EQ.M) GO TO 290
+!
+      DO 220 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 210 IJK=1,LOT
+      C(JA+J)=A(IA+I)+A(IB+I)
+      C(JB+J)=A(IA+I)-A(IB+I)
+      I=I+INC3
+      J=J+INC4
+  210 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  220 CONTINUE
+      JA=JA+JINK
+      JINK=2*JINK
+      JB=JB-JINK
+      IBASE=IBASE+IJUMP
+      IJUMP=2*IJUMP+IINK
+      IF (JA.EQ.JB) GO TO 260
+      DO 250 K=LA,KSTOP,LA
+      KB=K+K
+      C1=TRIGS(KB+1)
+      S1=TRIGS(KB+2)
+      JBASE=0
+      DO 240 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 230 IJK=1,LOT
+      C(JA+J)=A(IA+I)+(C1*A(IB+I)+S1*B(IB+I))
+      C(JB+J)=A(IA+I)-(C1*A(IB+I)+S1*B(IB+I))
+      D(JA+J)=(C1*B(IB+I)-S1*A(IB+I))+B(IA+I)
+      D(JB+J)=(C1*B(IB+I)-S1*A(IB+I))-B(IA+I)
+      I=I+INC3
+      J=J+INC4
+  230 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  240 CONTINUE
+      IBASE=IBASE+IJUMP
+      JA=JA+JINK
+      JB=JB-JINK
+  250 CONTINUE
+      IF (JA.GT.JB) GO TO 900
+  260 CONTINUE
+      JBASE=0
+      DO 280 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 270 IJK=1,LOT
+      C(JA+J)=A(IA+I)
+      D(JA+J)=-A(IB+I)
+      I=I+INC3
+      J=J+INC4
+  270 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  280 CONTINUE
+      GO TO 900
+!
+  290 CONTINUE
+      Z=1.0/FLOAT(N)
+      DO 294 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 292 IJK=1,LOT
+      C(JA+J)=Z*(A(IA+I)+A(IB+I))
+      C(JB+J)=Z*(A(IA+I)-A(IB+I))
+      I=I+INC3
+      J=J+INC4
+  292 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  294 CONTINUE
+      GO TO 900
+!
+!     CODING FOR FACTOR 3
+!     -------------------
+  300 CONTINUE
+      IA=1
+      IB=IA+IINK
+      IC=IB+IINK
+      JA=1
+      JB=JA+(2*M-LA)*INC2
+      JC=JB
+!
+      IF (LA.EQ.M) GO TO 390
+!
+      DO 320 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 310 IJK=1,LOT
+      C(JA+J)=A(IA+I)+(A(IB+I)+A(IC+I))
+      C(JB+J)=A(IA+I)-0.5*(A(IB+I)+A(IC+I))
+      D(JB+J)=SIN60*(A(IC+I)-A(IB+I))
+      I=I+INC3
+      J=J+INC4
+  310 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  320 CONTINUE
+      JA=JA+JINK
+      JINK=2*JINK
+      JB=JB+JINK
+      JC=JC-JINK
+      IBASE=IBASE+IJUMP
+      IJUMP=2*IJUMP+IINK
+      IF (JA.EQ.JC) GO TO 360
+      DO 350 K=LA,KSTOP,LA
+      KB=K+K
+      KC=KB+KB
+      C1=TRIGS(KB+1)
+      S1=TRIGS(KB+2)
+      C2=TRIGS(KC+1)
+      S2=TRIGS(KC+2)
+      JBASE=0
+      DO 340 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 330 IJK=1,LOT
+      A1=(C1*A(IB+I)+S1*B(IB+I))+(C2*A(IC+I)+S2*B(IC+I))
+      B1=(C1*B(IB+I)-S1*A(IB+I))+(C2*B(IC+I)-S2*A(IC+I))
+      A2=A(IA+I)-0.5*A1
+      B2=B(IA+I)-0.5*B1
+      A3=SIN60*((C1*A(IB+I)+S1*B(IB+I))-(C2*A(IC+I)+S2*B(IC+I)))
+      B3=SIN60*((C1*B(IB+I)-S1*A(IB+I))-(C2*B(IC+I)-S2*A(IC+I)))
+      C(JA+J)=A(IA+I)+A1
+      D(JA+J)=B(IA+I)+B1
+      C(JB+J)=A2+B3
+      D(JB+J)=B2-A3
+      C(JC+J)=A2-B3
+      D(JC+J)=-(B2+A3)
+      I=I+INC3
+      J=J+INC4
+  330 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  340 CONTINUE
+      IBASE=IBASE+IJUMP
+      JA=JA+JINK
+      JB=JB+JINK
+      JC=JC-JINK
+  350 CONTINUE
+      IF (JA.GT.JC) GO TO 900
+  360 CONTINUE
+      JBASE=0
+      DO 380 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 370 IJK=1,LOT
+      C(JA+J)=A(IA+I)+0.5*(A(IB+I)-A(IC+I))
+      D(JA+J)=-SIN60*(A(IB+I)+A(IC+I))
+      C(JB+J)=A(IA+I)-(A(IB+I)-A(IC+I))
+      I=I+INC3
+      J=J+INC4
+  370 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  380 CONTINUE
+      GO TO 900
+!
+  390 CONTINUE
+      Z=1.0/FLOAT(N)
+      ZSIN60=Z*SIN60
+      DO 394 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 392 IJK=1,LOT
+      C(JA+J)=Z*(A(IA+I)+(A(IB+I)+A(IC+I)))
+      C(JB+J)=Z*(A(IA+I)-0.5*(A(IB+I)+A(IC+I)))
+      D(JB+J)=ZSIN60*(A(IC+I)-A(IB+I))
+      I=I+INC3
+      J=J+INC4
+  392 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  394 CONTINUE
+      GO TO 900
+!
+!     CODING FOR FACTOR 4
+!     -------------------
+  400 CONTINUE
+      IA=1
+      IB=IA+IINK
+      IC=IB+IINK
+      ID=IC+IINK
+      JA=1
+      JB=JA+(2*M-LA)*INC2
+      JC=JB+2*M*INC2
+      JD=JB
+!
+      IF (LA.EQ.M) GO TO 490
+!
+      DO 420 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 410 IJK=1,LOT
+      C(JA+J)=(A(IA+I)+A(IC+I))+(A(IB+I)+A(ID+I))
+      C(JC+J)=(A(IA+I)+A(IC+I))-(A(IB+I)+A(ID+I))
+      C(JB+J)=A(IA+I)-A(IC+I)
+      D(JB+J)=A(ID+I)-A(IB+I)
+      I=I+INC3
+      J=J+INC4
+  410 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  420 CONTINUE
+      JA=JA+JINK
+      JINK=2*JINK
+      JB=JB+JINK
+      JC=JC-JINK
+      JD=JD-JINK
+      IBASE=IBASE+IJUMP
+      IJUMP=2*IJUMP+IINK
+      IF (JB.EQ.JC) GO TO 460
+      DO 450 K=LA,KSTOP,LA
+      KB=K+K
+      KC=KB+KB
+      KD=KC+KB
+      C1=TRIGS(KB+1)
+      S1=TRIGS(KB+2)
+      C2=TRIGS(KC+1)
+      S2=TRIGS(KC+2)
+      C3=TRIGS(KD+1)
+      S3=TRIGS(KD+2)
+      JBASE=0
+      DO 440 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 430 IJK=1,LOT
+      A0=A(IA+I)+(C2*A(IC+I)+S2*B(IC+I))
+      A2=A(IA+I)-(C2*A(IC+I)+S2*B(IC+I))
+      A1=(C1*A(IB+I)+S1*B(IB+I))+(C3*A(ID+I)+S3*B(ID+I))
+      A3=(C1*A(IB+I)+S1*B(IB+I))-(C3*A(ID+I)+S3*B(ID+I))
+      B0=B(IA+I)+(C2*B(IC+I)-S2*A(IC+I))
+      B2=B(IA+I)-(C2*B(IC+I)-S2*A(IC+I))
+      B1=(C1*B(IB+I)-S1*A(IB+I))+(C3*B(ID+I)-S3*A(ID+I))
+      B3=(C1*B(IB+I)-S1*A(IB+I))-(C3*B(ID+I)-S3*A(ID+I))
+      C(JA+J)=A0+A1
+      C(JC+J)=A0-A1
+      D(JA+J)=B0+B1
+      D(JC+J)=B1-B0
+      C(JB+J)=A2+B3
+      C(JD+J)=A2-B3
+      D(JB+J)=B2-A3
+      D(JD+J)=-(B2+A3)
+      I=I+INC3
+      J=J+INC4
+  430 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  440 CONTINUE
+      IBASE=IBASE+IJUMP
+      JA=JA+JINK
+      JB=JB+JINK
+      JC=JC-JINK
+      JD=JD-JINK
+  450 CONTINUE
+      IF (JB.GT.JC) GO TO 900
+  460 CONTINUE
+      SIN45=SQRT(0.5)
+      JBASE=0
+      DO 480 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 470 IJK=1,LOT
+      C(JA+J)=A(IA+I)+SIN45*(A(IB+I)-A(ID+I))
+      C(JB+J)=A(IA+I)-SIN45*(A(IB+I)-A(ID+I))
+      D(JA+J)=-A(IC+I)-SIN45*(A(IB+I)+A(ID+I))
+      D(JB+J)=A(IC+I)-SIN45*(A(IB+I)+A(ID+I))
+      I=I+INC3
+      J=J+INC4
+  470 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  480 CONTINUE
+      GO TO 900
+!
+  490 CONTINUE
+      Z=1.0/FLOAT(N)
+      DO 494 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 492 IJK=1,LOT
+      C(JA+J)=Z*((A(IA+I)+A(IC+I))+(A(IB+I)+A(ID+I)))
+      C(JC+J)=Z*((A(IA+I)+A(IC+I))-(A(IB+I)+A(ID+I)))
+      C(JB+J)=Z*(A(IA+I)-A(IC+I))
+      D(JB+J)=Z*(A(ID+I)-A(IB+I))
+      I=I+INC3
+      J=J+INC4
+  492 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  494 CONTINUE
+      GO TO 900
+!
+!     CODING FOR FACTOR 5
+!     -------------------
+  500 CONTINUE
+      IA=1
+      IB=IA+IINK
+      IC=IB+IINK
+      ID=IC+IINK
+      IE=ID+IINK
+      JA=1
+      JB=JA+(2*M-LA)*INC2
+      JC=JB+2*M*INC2
+      JD=JC
+      JE=JB
+!
+      IF (LA.EQ.M) GO TO 590
+!
+      DO 520 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 510 IJK=1,LOT
+      A1=A(IB+I)+A(IE+I)
+      A3=A(IB+I)-A(IE+I)
+      A2=A(IC+I)+A(ID+I)
+      A4=A(IC+I)-A(ID+I)
+      A5=A(IA+I)-0.25*(A1+A2)
+      A6=QRT5*(A1-A2)
+      C(JA+J)=A(IA+I)+(A1+A2)
+      C(JB+J)=A5+A6
+      C(JC+J)=A5-A6
+      D(JB+J)=-SIN72*A3-SIN36*A4
+      D(JC+J)=-SIN36*A3+SIN72*A4
+      I=I+INC3
+      J=J+INC4
+  510 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  520 CONTINUE
+      JA=JA+JINK
+      JINK=2*JINK
+      JB=JB+JINK
+      JC=JC+JINK
+      JD=JD-JINK
+      JE=JE-JINK
+      IBASE=IBASE+IJUMP
+      IJUMP=2*IJUMP+IINK
+      IF (JB.EQ.JD) GO TO 560
+      DO 550 K=LA,KSTOP,LA
+      KB=K+K
+      KC=KB+KB
+      KD=KC+KB
+      KE=KD+KB
+      C1=TRIGS(KB+1)
+      S1=TRIGS(KB+2)
+      C2=TRIGS(KC+1)
+      S2=TRIGS(KC+2)
+      C3=TRIGS(KD+1)
+      S3=TRIGS(KD+2)
+      C4=TRIGS(KE+1)
+      S4=TRIGS(KE+2)
+      JBASE=0
+      DO 540 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 530 IJK=1,LOT
+      A1=(C1*A(IB+I)+S1*B(IB+I))+(C4*A(IE+I)+S4*B(IE+I))
+      A3=(C1*A(IB+I)+S1*B(IB+I))-(C4*A(IE+I)+S4*B(IE+I))
+      A2=(C2*A(IC+I)+S2*B(IC+I))+(C3*A(ID+I)+S3*B(ID+I))
+      A4=(C2*A(IC+I)+S2*B(IC+I))-(C3*A(ID+I)+S3*B(ID+I))
+      B1=(C1*B(IB+I)-S1*A(IB+I))+(C4*B(IE+I)-S4*A(IE+I))
+      B3=(C1*B(IB+I)-S1*A(IB+I))-(C4*B(IE+I)-S4*A(IE+I))
+      B2=(C2*B(IC+I)-S2*A(IC+I))+(C3*B(ID+I)-S3*A(ID+I))
+      B4=(C2*B(IC+I)-S2*A(IC+I))-(C3*B(ID+I)-S3*A(ID+I))
+      A5=A(IA+I)-0.25*(A1+A2)
+      A6=QRT5*(A1-A2)
+      B5=B(IA+I)-0.25*(B1+B2)
+      B6=QRT5*(B1-B2)
+      A10=A5+A6
+      A20=A5-A6
+      B10=B5+B6
+      B20=B5-B6
+      A11=SIN72*B3+SIN36*B4
+      A21=SIN36*B3-SIN72*B4
+      B11=SIN72*A3+SIN36*A4
+      B21=SIN36*A3-SIN72*A4
+      C(JA+J)=A(IA+I)+(A1+A2)
+      C(JB+J)=A10+A11
+      C(JE+J)=A10-A11
+      C(JC+J)=A20+A21
+      C(JD+J)=A20-A21
+      D(JA+J)=B(IA+I)+(B1+B2)
+      D(JB+J)=B10-B11
+      D(JE+J)=-(B10+B11)
+      D(JC+J)=B20-B21
+      D(JD+J)=-(B20+B21)
+      I=I+INC3
+      J=J+INC4
+  530 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  540 CONTINUE
+      IBASE=IBASE+IJUMP
+      JA=JA+JINK
+      JB=JB+JINK
+      JC=JC+JINK
+      JD=JD-JINK
+      JE=JE-JINK
+  550 CONTINUE
+      IF (JB.GT.JD) GO TO 900
+  560 CONTINUE
+      JBASE=0
+      DO 580 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 570 IJK=1,LOT
+      A1=A(IB+I)+A(IE+I)
+      A3=A(IB+I)-A(IE+I)
+      A2=A(IC+I)+A(ID+I)
+      A4=A(IC+I)-A(ID+I)
+      A5=A(IA+I)+0.25*(A3-A4)
+      A6=QRT5*(A3+A4)
+      C(JA+J)=A5+A6
+      C(JB+J)=A5-A6
+      C(JC+J)=A(IA+I)-(A3-A4)
+      D(JA+J)=-SIN36*A1-SIN72*A2
+      D(JB+J)=-SIN72*A1+SIN36*A2
+      I=I+INC3
+      J=J+INC4
+  570 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  580 CONTINUE
+      GO TO 900
+!
+  590 CONTINUE
+      Z=1.0/FLOAT(N)
+      ZQRT5=Z*QRT5
+      ZSIN36=Z*SIN36
+      ZSIN72=Z*SIN72
+      DO 594 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 592 IJK=1,LOT
+      A1=A(IB+I)+A(IE+I)
+      A3=A(IB+I)-A(IE+I)
+      A2=A(IC+I)+A(ID+I)
+      A4=A(IC+I)-A(ID+I)
+      A5=Z*(A(IA+I)-0.25*(A1+A2))
+      A6=ZQRT5*(A1-A2)
+      C(JA+J)=Z*(A(IA+I)+(A1+A2))
+      C(JB+J)=A5+A6
+      C(JC+J)=A5-A6
+      D(JB+J)=-ZSIN72*A3-ZSIN36*A4
+      D(JC+J)=-ZSIN36*A3+ZSIN72*A4
+      I=I+INC3
+      J=J+INC4
+  592 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  594 CONTINUE
+      GO TO 900
+!
+!     CODING FOR FACTOR 6
+!     -------------------
+  600 CONTINUE
+      IA=1
+      IB=IA+IINK
+      IC=IB+IINK
+      ID=IC+IINK
+      IE=ID+IINK
+      IF=IE+IINK
+      JA=1
+      JB=JA+(2*M-LA)*INC2
+      JC=JB+2*M*INC2
+      JD=JC+2*M*INC2
+      JE=JC
+      JF=JB
+!
+      IF (LA.EQ.M) GO TO 690
+!
+      DO 620 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 610 IJK=1,LOT
+      A11=(A(IC+I)+A(IF+I))+(A(IB+I)+A(IE+I))
+      C(JA+J)=(A(IA+I)+A(ID+I))+A11
+      C(JC+J)=(A(IA+I)+A(ID+I)-0.5*A11)
+      D(JC+J)=SIN60*((A(IC+I)+A(IF+I))-(A(IB+I)+A(IE+I)))
+      A11=(A(IC+I)-A(IF+I))+(A(IE+I)-A(IB+I))
+      C(JB+J)=(A(IA+I)-A(ID+I))-0.5*A11
+      D(JB+J)=SIN60*((A(IE+I)-A(IB+I))-(A(IC+I)-A(IF+I)))
+      C(JD+J)=(A(IA+I)-A(ID+I))+A11
+      I=I+INC3
+      J=J+INC4
+  610 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  620 CONTINUE
+      JA=JA+JINK
+      JINK=2*JINK
+      JB=JB+JINK
+      JC=JC+JINK
+      JD=JD-JINK
+      JE=JE-JINK
+      JF=JF-JINK
+      IBASE=IBASE+IJUMP
+      IJUMP=2*IJUMP+IINK
+      IF (JC.EQ.JD) GO TO 660
+      DO 650 K=LA,KSTOP,LA
+      KB=K+K
+      KC=KB+KB
+      KD=KC+KB
+      KE=KD+KB
+      KF=KE+KB
+      C1=TRIGS(KB+1)
+      S1=TRIGS(KB+2)
+      C2=TRIGS(KC+1)
+      S2=TRIGS(KC+2)
+      C3=TRIGS(KD+1)
+      S3=TRIGS(KD+2)
+      C4=TRIGS(KE+1)
+      S4=TRIGS(KE+2)
+      C5=TRIGS(KF+1)
+      S5=TRIGS(KF+2)
+      JBASE=0
+      DO 640 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 630 IJK=1,LOT
+      A1=C1*A(IB+I)+S1*B(IB+I)
+      B1=C1*B(IB+I)-S1*A(IB+I)
+      A2=C2*A(IC+I)+S2*B(IC+I)
+      B2=C2*B(IC+I)-S2*A(IC+I)
+      A3=C3*A(ID+I)+S3*B(ID+I)
+      B3=C3*B(ID+I)-S3*A(ID+I)
+      A4=C4*A(IE+I)+S4*B(IE+I)
+      B4=C4*B(IE+I)-S4*A(IE+I)
+      A5=C5*A(IF+I)+S5*B(IF+I)
+      B5=C5*B(IF+I)-S5*A(IF+I)
+      A11=(A2+A5)+(A1+A4)
+      A20=(A(IA+I)+A3)-0.5*A11
+      A21=SIN60*((A2+A5)-(A1+A4))
+      B11=(B2+B5)+(B1+B4)
+      B20=(B(IA+I)+B3)-0.5*B11
+      B21=SIN60*((B2+B5)-(B1+B4))
+      C(JA+J)=(A(IA+I)+A3)+A11
+      D(JA+J)=(B(IA+I)+B3)+B11
+      C(JC+J)=A20-B21
+      D(JC+J)=A21+B20
+      C(JE+J)=A20+B21
+      D(JE+J)=A21-B20
+      A11=(A2-A5)+(A4-A1)
+      A20=(A(IA+I)-A3)-0.5*A11
+      A21=SIN60*((A4-A1)-(A2-A5))
+      B11=(B5-B2)-(B4-B1)
+      B20=(B3-B(IA+I))-0.5*B11
+      B21=SIN60*((B5-B2)+(B4-B1))
+      C(JB+J)=A20-B21
+      D(JB+J)=A21-B20
+      C(JD+J)=A11+(A(IA+I)-A3)
+      D(JD+J)=B11+(B3-B(IA+I))
+      C(JF+J)=A20+B21
+      D(JF+J)=A21+B20
+      I=I+INC3
+      J=J+INC4
+  630 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  640 CONTINUE
+      IBASE=IBASE+IJUMP
+      JA=JA+JINK
+      JB=JB+JINK
+      JC=JC+JINK
+      JD=JD-JINK
+      JE=JE-JINK
+      JF=JF-JINK
+  650 CONTINUE
+      IF (JC.GT.JD) GO TO 900
+  660 CONTINUE
+      JBASE=0
+      DO 680 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 670 IJK=1,LOT
+      C(JA+J)=(A(IA+I)+0.5*(A(IC+I)-A(IE+I)))+ SIN60*(A(IB+I)-A(IF+I))
+      D(JA+J)=-(A(ID+I)+0.5*(A(IB+I)+A(IF+I)))-SIN60*(A(IC+I)+A(IE+I))
+      C(JB+J)=A(IA+I)-(A(IC+I)-A(IE+I))
+      D(JB+J)=A(ID+I)-(A(IB+I)+A(IF+I))
+      C(JC+J)=(A(IA+I)+0.5*(A(IC+I)-A(IE+I)))-SIN60*(A(IB+I)-A(IF+I))
+      D(JC+J)=-(A(ID+I)+0.5*(A(IB+I)+A(IF+I)))+SIN60*(A(IC+I)+A(IE+I))
+      I=I+INC3
+      J=J+INC4
+  670 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  680 CONTINUE
+      GO TO 900
+!
+  690 CONTINUE
+      Z=1.0/FLOAT(N)
+      ZSIN60=Z*SIN60
+      DO 694 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 692 IJK=1,LOT
+      A11=(A(IC+I)+A(IF+I))+(A(IB+I)+A(IE+I))
+      C(JA+J)=Z*((A(IA+I)+A(ID+I))+A11)
+      C(JC+J)=Z*((A(IA+I)+A(ID+I))-0.5*A11)
+      D(JC+J)=ZSIN60*((A(IC+I)+A(IF+I))-(A(IB+I)+A(IE+I)))
+      A11=(A(IC+I)-A(IF+I))+(A(IE+I)-A(IB+I))
+      C(JB+J)=Z*((A(IA+I)-A(ID+I))-0.5*A11)
+      D(JB+J)=ZSIN60*((A(IE+I)-A(IB+I))-(A(IC+I)-A(IF+I)))
+      C(JD+J)=Z*((A(IA+I)-A(ID+I))+A11)
+      I=I+INC3
+      J=J+INC4
+  692 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  694 CONTINUE
+      GO TO 900
+!
+!     CODING FOR FACTOR 8
+!     -------------------
+  800 CONTINUE
+      IBAD=3
+      IF (LA.NE.M) GO TO 910
+      IA=1
+      IB=IA+IINK
+      IC=IB+IINK
+      ID=IC+IINK
+      IE=ID+IINK
+      IF=IE+IINK
+      IG=IF+IINK
+      IH=IG+IINK
+      JA=1
+      JB=JA+LA*INC2
+      JC=JB+2*M*INC2
+      JD=JC+2*M*INC2
+      JE=JD+2*M*INC2
+      Z=1.0/FLOAT(N)
+      ZSIN45=Z*SQRT(0.5)
+!
+      DO 820 L=1,LA
+      I=IBASE
+      J=JBASE
+!DIR$ IVDEP
+      DO 810 IJK=1,LOT
+      C(JA+J)=Z*(((A(IA+I)+A(IE+I))+(A(IC+I)+A(IG+I)))+ &
+         ((A(ID+I)+A(IH+I))+(A(IB+I)+A(IF+I))))
+      C(JE+J)=Z*(((A(IA+I)+A(IE+I))+(A(IC+I)+A(IG+I)))- &
+         ((A(ID+I)+A(IH+I))+(A(IB+I)+A(IF+I))))
+      C(JC+J)=Z*((A(IA+I)+A(IE+I))-(A(IC+I)+A(IG+I)))
+      D(JC+J)=Z*((A(ID+I)+A(IH+I))-(A(IB+I)+A(IF+I)))
+      C(JB+J)=Z*(A(IA+I)-A(IE+I))+ZSIN45*((A(IH+I)-A(ID+I))-(A(IF+I)-A(IB+I)))
+      C(JD+J)=Z*(A(IA+I)-A(IE+I))-ZSIN45*((A(IH+I)-A(ID+I))-(A(IF+I)-A(IB+I)))
+      D(JB+J)=ZSIN45*((A(IH+I)-A(ID+I))+(A(IF+I)-A(IB+I)))+Z*(A(IG+I)-A(IC+I))
+      D(JD+J)=ZSIN45*((A(IH+I)-A(ID+I))+(A(IF+I)-A(IB+I)))-Z*(A(IG+I)-A(IC+I))
+      I=I+INC3
+      J=J+INC4
+  810 CONTINUE
+      IBASE=IBASE+INC1
+      JBASE=JBASE+INC2
+  820 CONTINUE
+!
+
+  900 CONTINUE
+      IBAD=0
+  910 CONTINUE
+      IERR=IBAD
+
+      RETURN
+      END SUBROUTINE QPASSM
+
diff --git a/var/gen_be_v3/util/combine_be_cv5.f90 b/var/gen_be_v3/util/combine_be_cv5.f90
new file mode 100644
index 0000000000..75d9710190
--- /dev/null
+++ b/var/gen_be_v3/util/combine_be_cv5.f90
@@ -0,0 +1,293 @@
+program be_readwrite
+
+! gfortran -o combine_be_cv5.exe -fconvert=big-endian combine_be_cv5.f90
+
+! input:  be_[PSI|CHI_U|T_U|RH|PSFC_U].dat
+!         regcoeff[1|2|3].dat
+! output: be.dat (in real(8) and big_endian)
+
+   implicit none
+
+   integer               :: ni, nj, nk           ! dimensions read in
+   integer               :: bin_type             ! type of bin to average over
+   integer               :: num_bins             ! number of 3D bins
+   integer               :: num_bins2d           ! number of 2D bins
+   integer, allocatable  :: bin(:,:,:)           ! bin assigned to each 3D point
+   integer, allocatable  :: bin2d(:,:)           ! bin assigned to each 2D point
+
+!for be_[PSI|CHI_U|T_U|RH|PSFC_U].dat in real(4) when gen_be_v3 is run with write_be_dat_r8=.false.
+   !real,    allocatable  :: e_vec(:,:)           ! domain-averaged eigenvectors
+   !real,    allocatable  :: e_val(:)             ! domain-averaged eigenvalues
+   !real,    allocatable  :: e_vec_loc(:,:,:)     ! latitudinally varying eigenvectors
+   !real,    allocatable  :: e_val_loc(:,:)       ! latitudinally varying eigenvalues
+   !real,    allocatable  :: scale_length(:)      ! scale length for regional application
+!for be_[PSI|CHI_U|T_U|RH|PSFC_U].dat in real(8) when gen_be_v3 is run with write_be_dat_r8=.true.(default)
+   real*8,  allocatable  :: e_vec(:,:)           ! domain-averaged eigenvectors
+   real*8,  allocatable  :: e_val(:)             ! domain-averaged eigenvalues
+   real*8,  allocatable  :: e_vec_loc(:,:,:)     ! latitudinally varying eigenvectors
+   real*8,  allocatable  :: e_val_loc(:,:)       ! latitudinally varying eigenvalues
+   real*8,  allocatable  :: scale_length(:)      ! scale length for regional application
+
+   real*8,  allocatable  :: e_vec_r8(:,:)        ! domain-averaged eigenvectors
+   real*8,  allocatable  :: e_val_r8(:)          ! domain-averaged eigenvalues
+   real*8,  allocatable  :: e_vec_loc_r8(:,:,:)  ! latitudinally varying eigenvectors
+   real*8,  allocatable  :: e_val_loc_r8(:,:)    ! latitudinally varying eigenvalues
+   real*8,  allocatable  :: scale_length_r8(:)   ! scale length for regional application
+
+   real,    allocatable  :: regcoeff1(:)         ! psi/chi regcoeff
+   real,    allocatable  :: regcoeff2(:,:)       ! psi/ps regcoeff
+   real,    allocatable  :: regcoeff3(:,:,:)     ! psi/t regcoeff
+   real*8,  allocatable  :: regcoeff1_r8(:)      ! psi/chi regcoeff
+   real*8,  allocatable  :: regcoeff2_r8(:,:)    ! psi/ps regcoeff
+   real*8,  allocatable  :: regcoeff3_r8(:,:,:)  ! psi/t regcoeff
+
+   character(len=256)    :: infile               ! input filename
+   character(len=256)    :: outfile              ! output filename
+   logical               :: isfile
+   integer               :: iunit, ounit
+
+   integer, parameter :: nvar = 5
+   character(len=10)  :: varnames(nvar) = (/ "PSI       ", &
+                                             "CHI_U     ", &
+                                             "T_U       ", &
+                                             "RH        ", &
+                                             "PSFC_U    " /)
+   character(len=10)  :: varout(nvar)   = (/ "psi       ", &
+                                             "chi_u     ", &
+                                             "t_u       ", &
+                                             "rh        ", &
+                                             "ps_u      " /)
+   character(len=10)  :: this_var
+   integer            :: var_dim
+   integer            :: i
+   integer            :: nk_2d = 1
+
+   real*8  :: lat_min, lat_max, binwidth_lat
+   real*8  :: hgt_min, hgt_max, binwidth_hgt
+
+   lat_min = 0.0
+   lat_max = 0.0
+   binwidth_lat = 0.0
+   hgt_min = 0.0
+   hgt_max = 0.0
+   binwidth_hgt = 0.0
+
+   iunit = 81
+   ounit = 82
+
+   outfile = 'be.dat'
+   open (ounit, file=trim(outfile), form='unformatted', status='replace')
+
+   ! read and write dimension and bin info
+
+   infile = 'be_'//trim(varnames(1))//'.dat'
+   inquire(file=trim(infile), exist=isfile)
+   if ( .not. isfile ) then
+      write(*,*) 'STOP: ', trim(infile), ' does not exist.'
+      stop
+   end if
+   open (iunit, file=trim(infile), form='unformatted', status='old')
+   write(*,*) 'Reading from ', trim(infile), ' for dimension and bin info'
+
+   read(iunit) ni, nj, nk
+
+   allocate( bin(1:ni,1:nj,1:nk) )
+   allocate( bin2d(1:ni,1:nj) )
+
+   read(iunit) bin_type
+   read(iunit) num_bins, num_bins2d
+   read(iunit) bin
+   read(iunit) bin2d
+   close(iunit)
+
+   write(ounit) ni, nj, nk
+   write(ounit) bin_type
+   write(ounit) lat_min, lat_max, binwidth_lat
+   write(ounit) hgt_min, hgt_max, binwidth_hgt
+   write(ounit) num_bins, num_bins2d
+   write(ounit) bin
+   write(ounit) bin2d
+
+   ! read and write regcoeff info
+
+   allocate( regcoeff1(1:num_bins) )
+   allocate( regcoeff2(1:nk,1:num_bins2d) )
+   allocate( regcoeff3(1:nk,1:nk,1:num_bins2d) )
+   allocate( regcoeff1_r8(1:num_bins) )
+   allocate( regcoeff2_r8(1:nk,1:num_bins2d) )
+   allocate( regcoeff3_r8(1:nk,1:nk,1:num_bins2d) )
+
+   infile = 'regcoeff1.dat'
+   inquire(file=trim(infile), exist=isfile)
+   if ( .not. isfile ) then
+      write(*,*) 'STOP: ', trim(infile), ' does not exist.'
+      stop
+   end if
+   open (iunit, file=trim(infile), form='unformatted', status='old')
+   write(*,*) 'Reading from ', trim(infile)
+   read(iunit) ni, nj, nk
+   read(iunit) num_bins, num_bins2d
+   read(iunit) regcoeff1
+   regcoeff1_r8(:) = regcoeff1(:)
+   write(ounit) regcoeff1_r8
+   close(iunit)
+
+   infile = 'regcoeff2.dat'
+   inquire(file=trim(infile), exist=isfile)
+   if ( .not. isfile ) then
+      write(*,*) 'STOP: ', trim(infile), ' does not exist.'
+      stop
+   end if
+   open (iunit, file=trim(infile), form='unformatted', status='old')
+   write(*,*) 'Reading from ', trim(infile)
+   read(iunit) ni, nj, nk
+   read(iunit) num_bins, num_bins2d
+   read(iunit) regcoeff2
+   regcoeff2_r8(:,:) = regcoeff2(:,:)
+   write(ounit) regcoeff2_r8
+   close(iunit)
+
+   infile = 'regcoeff3.dat'
+   inquire(file=trim(infile), exist=isfile)
+   if ( .not. isfile ) then
+      write(*,*) 'STOP: ', trim(infile), ' does not exist.'
+      stop
+   end if
+   open (iunit, file=trim(infile), form='unformatted', status='old')
+   write(*,*) 'Reading from ', trim(infile)
+   read(iunit) ni, nj, nk
+   read(iunit) num_bins, num_bins2d
+   read(iunit) regcoeff3
+   regcoeff3_r8(:,:,:) = regcoeff3(:,:,:)
+   write(ounit) regcoeff3_r8
+   close(iunit)
+
+   ! done writing dimension, bin and regcoeff info
+
+   deallocate( regcoeff1 )
+   deallocate( regcoeff2 )
+   deallocate( regcoeff3 )
+   deallocate( regcoeff1_r8 )
+   deallocate( regcoeff2_r8 )
+   deallocate( regcoeff3_r8 )
+
+   allocate( e_vec(1:nk,1:nk) )
+   allocate( e_val(1:nk) )
+   allocate( e_vec_loc(1:nk,1:nk,1:num_bins2d) )
+   allocate( e_val_loc(1:nk,1:num_bins2d) )
+   allocate( e_vec_r8(1:nk,1:nk) )
+   allocate( e_val_r8(1:nk) )
+   allocate( e_vec_loc_r8(1:nk,1:nk,1:num_bins2d) )
+   allocate( e_val_loc_r8(1:nk,1:num_bins2d) )
+
+   ! loop over individual files to gather and write out e_vec/e_val info
+
+   do i = 1, nvar
+
+      infile = 'be_'//trim(varnames(i))//'.dat'
+      inquire(file=trim(infile), exist=isfile)
+      if ( .not. isfile ) then
+         write(*,*) 'STOP: ', trim(infile), ' does not exist.'
+         stop
+      end if
+      open (iunit, file=trim(infile), form='unformatted', status='old')
+      write(*,*) 'Reading from ', trim(infile)
+
+      read(iunit) ni, nj, nk
+
+      read(iunit) bin_type
+      read(iunit) num_bins, num_bins2d
+      read(iunit) bin
+      read(iunit) bin2d
+
+      read(iunit) this_var
+      write(ounit) varout(i)
+
+      read(iunit) var_dim
+      if ( var_dim == 3 ) then
+         read(iunit) e_vec
+         read(iunit) e_val
+         read(iunit) e_vec_loc
+         read(iunit) e_val_loc
+         e_vec_r8 = e_vec
+         e_val_r8 = e_val
+         e_vec_loc_r8 = e_vec_loc
+         e_val_loc_r8 = e_val_loc
+         write(ounit) nk, num_bins2d
+         write(ounit) e_vec_r8
+         write(ounit) e_val_r8
+         write(ounit) e_vec_loc_r8
+         write(ounit) e_val_loc_r8
+      else if ( var_dim == 2 ) then
+         read(iunit) e_vec(1:1,1:1)
+         read(iunit) e_val(1:1)
+         read(iunit) e_vec_loc(1:1,1:1,:)
+         read(iunit) e_val_loc(1:1,:)
+         e_vec_r8 = e_vec
+         e_val_r8 = e_val
+         e_vec_loc_r8 = e_vec_loc
+         e_val_loc_r8 = e_val_loc
+         write(ounit) nk_2d, num_bins2d
+         write(ounit) e_vec_r8(1:1,1:1)
+         write(ounit) e_val_r8(1:1)
+         write(ounit) e_vec_loc_r8(1:1,1:1,:)
+         write(ounit) e_val_loc_r8(1:1,:)
+      end if
+      close(iunit)
+   end do ! var loop for e_val/e_vec
+
+   ! read individual files again to gather and write out lengthscales
+
+   allocate( scale_length(1:nk) )
+   allocate( scale_length_r8(1:nk) )
+
+   do i = 1, nvar
+      infile = 'be_'//trim(varnames(i))//'.dat'
+      open (iunit, file=trim(infile), form='unformatted', status='old')
+      !write(*,*) 'Reading from ', trim(infile)
+      read(iunit) ni, nj, nk
+      read(iunit) bin_type
+      read(iunit) num_bins, num_bins2d
+      read(iunit) bin
+      read(iunit) bin2d
+
+      read(iunit) this_var
+      write(ounit) varout(i)
+
+      read(iunit) var_dim
+      if ( var_dim == 3 ) then
+         read(iunit) e_vec
+         read(iunit) e_val
+         read(iunit) e_vec_loc
+         read(iunit) e_val_loc
+         read(iunit) scale_length
+         scale_length_r8 = scale_length
+         write(ounit) scale_length_r8
+      else if ( var_dim == 2 ) then
+         read(iunit) e_vec(1:1,1:1)
+         read(iunit) e_val(1:1)
+         read(iunit) e_vec_loc(1:1,1:1,:)
+         read(iunit) e_val_loc(1:1,:)
+         read(iunit) scale_length(1:1)
+         scale_length_r8 = scale_length
+         write(ounit) scale_length_r8(1:1)
+      end if
+      close(iunit)
+   end do
+   close(ounit)
+
+   deallocate( e_vec )
+   deallocate( e_val )
+   deallocate( e_vec_loc )
+   deallocate( e_val_loc )
+   deallocate( e_vec_r8 )
+   deallocate( e_val_r8 )
+   deallocate( e_vec_loc_r8 )
+   deallocate( e_val_loc_r8 )
+
+   deallocate (scale_length)
+   deallocate (scale_length_r8)
+
+   write(*,*) 'Done writing be.dat for cv_options=5'
+
+end program be_readwrite

From e9add3a71f215a56caddfb63f99fd478f22fc68b Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Sun, 23 Aug 2020 16:48:48 -0600
Subject: [PATCH 02/13] Add gen_be_v3 vertical localization option

Default is vertloc_opt=0, no vertical localization.
Set in namelist.gen_be_v3
&gen_be_nml
 vertloc_opt=1
to apply vertical localization using log-P algorithm.

modified:   var/gen_be_v3/gen_be_v3.F90
---
 var/gen_be_v3/gen_be_v3.F90 | 183 +++++++++++++++++++++++++++++++++++-
 1 file changed, 182 insertions(+), 1 deletion(-)

diff --git a/var/gen_be_v3/gen_be_v3.F90 b/var/gen_be_v3/gen_be_v3.F90
index 64d5a567ca..c9bcf39944 100644
--- a/var/gen_be_v3/gen_be_v3.F90
+++ b/var/gen_be_v3/gen_be_v3.F90
@@ -55,7 +55,8 @@ program gen_be_v3
 
    namelist /gen_be_nml/ nc_list_file, be_method, varnames, outnames, &
       do_pert_calc, do_eof_transform, do_slen_calc, slen_opt, stride, yr_cutoff, &
-      verbose, level_start, level_end, aux_file, write_be_dat_r8, pert1_read_opt
+      verbose, level_start, level_end, aux_file, write_be_dat_r8, pert1_read_opt, &
+      vertloc_opt
    namelist /ens_nml/ nens, write_ep, write_ens_stdv, ep_format, nproc_out
 
    character(len=filename_len) :: nc_list_file  ! the text file that contains a list of netcdf files to process
@@ -83,6 +84,8 @@ program gen_be_v3
    integer :: pert1_read_opt         ! how to access pert1 data for compute_bv_sl
                                      ! 1: read and store all cases in memory at once
                                      ! 2: read from pert1 file when need it
+   integer :: vertloc_opt            ! 0: no vertical localization
+                                     ! 1: with vertical localization using log-P algorithm
    logical :: write_pert0
    logical :: write_pert1
    logical :: write_be_dat_r8        ! if writing out be.dat in real*8
@@ -100,6 +103,9 @@ program gen_be_v3
    logical, allocatable :: do_this_var(:)
    integer, allocatable :: var_dim(:)
 
+   real, allocatable :: vertloc_rho(:,:)
+   real, allocatable :: vertloc_kcutoff(:)
+
    logical :: read_t
    logical :: read_qv
    logical :: read_prs
@@ -207,6 +213,7 @@ program gen_be_v3
    aux_file = 'none'
    write_be_dat_r8 = .true.
    pert1_read_opt = 1
+   vertloc_opt = 0
 
    ! initialize internal options
    write_pert0 = .false.
@@ -335,6 +342,24 @@ program gen_be_v3
       call read_fixed_fields(trim(aux_file))
    end if
 
+   if ( vertloc_opt > 0 ) then
+      allocate(vertloc_rho(nk,nk))
+      allocate(vertloc_kcutoff(nk))
+      if ( trim(aux_file) == 'none' ) then
+         ! read vertical level info from file filenames(1,1)
+         aux_file = filenames(1,1)
+      end if
+      call get_vertloc(trim(aux_file), nk, vertloc_rho, vertloc_kcutoff)
+      if ( myproc == root ) then
+         write(stdout,*)
+         do k = 1,nk
+            write(stdout, '(a,i4,2x,f4.1)')   ' vertloc: k, kcutoff   = ', k, vertloc_kcutoff(k)
+            write(stdout, '(a,i4,100(f6.3))') ' vertloc: k, rho(k:nk) = ', k, vertloc_rho(k,k:nk)
+         end do
+         write(stdout,*)
+      end if
+   end if
+
    if ( do_pert_calc ) then
       call compute_pert
    end if
@@ -523,6 +548,11 @@ program gen_be_v3
    if ( allocated(do_this_var) ) deallocate(do_this_var)
    if ( allocated(var_dim) ) deallocate(var_dim)
 
+   if ( vertloc_opt > 0 ) then
+      if ( allocated(vertloc_rho) ) deallocate(vertloc_rho)
+      if ( allocated(vertloc_kcutoff) ) deallocate(vertloc_kcutoff)
+   end if
+
 if ( myproc == root ) write(stdout,'(a)')' All Done!'
 
 #ifdef DM_PARALLEL
@@ -949,6 +979,151 @@ subroutine read_fixed_fields_mapfac(input_file)
 
 end subroutine read_fixed_fields_mapfac
 
+subroutine get_vertloc(input_file, kz, rho, kcutoff)
+
+   implicit none
+
+   character(len=*),intent(in) :: input_file
+   integer, intent(in)         :: kz
+   real, intent(inout)         :: rho(kz,kz)
+   real, intent(inout)         :: kcutoff(kz)
+
+   integer                         :: fid, ierr
+   character(len=DateStrLen)       :: DateStr
+   character(len=80), dimension(3) :: dimnames
+   character(len=4)                :: staggering = ' N/A' !dummy
+   character(len=3)                :: ordering
+   integer, dimension(4)           :: start_index, end_index
+   integer                         :: ndim, wrftype
+   character(len=512)              :: message
+   character(len=VarNameLen)       :: varname
+   real(r_single), allocatable     :: xfield(:)
+
+   integer           :: k, k1, i
+   real              :: kscale, kscale_invsq, kdist
+   real              :: cutoff
+   real              :: p_top, base_pres
+   real, allocatable :: znw(:)
+   real, allocatable :: p_w(:)  ! pressure at full levels
+   real, allocatable :: ln_p_w(:)
+   real, allocatable :: dlnp(:,:)
+   real, allocatable :: kcutoff_tmp(:)
+   logical           :: smooth_kcutoff
+
+   call ext_ncd_open_for_read(trim(input_file), 0, 0, "", fid, ierr)
+   write(message,'(a,i3,a,a)') ' Proc ', myproc, ' opening ', trim(input_file)
+   call error_handler(ierr, trim(message))
+
+   call ext_ncd_get_next_time(fid, DateStr, ierr)
+   call error_handler(ierr, 'ext_ncd_get_next_time')
+
+   if ( .not. allocated(xfield) ) allocate (xfield(kz+1))
+   varname = 'P_TOP'
+   call ext_ncd_get_var_info (fid, trim(varname), ndim, ordering, staggering,  &
+                              start_index, end_index, wrftype, ierr)
+   call error_handler(ierr, 'ext_ncd_get_var_info '//trim(varname))
+   call ext_ncd_read_field(fid, DateStr, trim(varname),  &
+                           xfield(1), wrftype,           &
+                           0, 0, 0, ordering,            &
+                           staggering, dimnames,         & !dummy
+                           start_index, end_index,       & !dom
+                           start_index, end_index,       & !mem
+                           start_index, end_index,       & !pat
+                           ierr                   )
+   call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+   p_top = xfield(1)
+
+   varname = 'P00'
+   call ext_ncd_get_var_info (fid, trim(varname), ndim, ordering, staggering,  &
+                              start_index, end_index, wrftype, ierr)
+   call error_handler(ierr, 'ext_ncd_get_var_info '//trim(varname))
+   call ext_ncd_read_field(fid, DateStr, trim(varname),  &
+                           xfield(1), wrftype,           &
+                           0, 0, 0, ordering,            &
+                           staggering, dimnames,         & !dummy
+                           start_index, end_index,       & !dom
+                           start_index, end_index,       & !mem
+                           start_index, end_index,       & !pat
+                           ierr                   )
+   call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+   base_pres = xfield(1)
+
+   if ( .not. allocated(znw) ) allocate(znw(kz+1))
+   varname = 'ZNW'
+   call ext_ncd_get_var_info (fid, trim(varname), ndim, ordering, staggering,  &
+                              start_index, end_index, wrftype, ierr)
+   call error_handler(ierr, 'ext_ncd_get_var_info '//trim(varname))
+   call ext_ncd_read_field(fid, DateStr, trim(varname),  &
+                           xfield(:), wrftype,           &
+                           0, 0, 0, ordering,            &
+                           staggering, dimnames,         & !dummy
+                           start_index, end_index,       & !dom
+                           start_index, end_index,       & !mem
+                           start_index, end_index,       & !pat
+                           ierr                   )
+   call error_handler(ierr, 'ext_ncd_read_field '//trim(varname))
+   znw(:) = xfield(:)
+
+   call ext_ncd_ioclose(fid, ierr)
+   if ( allocated(xfield) ) deallocate (xfield)
+
+   allocate(p_w(1:kz+1))
+   allocate(ln_p_w(1:kz+1))
+   allocate(dlnp(1:kz,1:kz))
+
+   ! empirical settings
+   cutoff   = 1.0/2.71828 !1/e
+
+   do k = 1, kz+1
+      p_w(k) = znw(k)*(base_pres - p_top) + p_top
+      ln_p_w(k) = log(max(p_w(k),0.0001))
+   end do
+   kcutoff(:) = 1.0  ! initialize
+   do k = 1, kz
+      do k1 = k+1, kz
+         dlnp(k,k1) = abs(ln_p_w(k)-ln_p_w(k1))
+         if ( dlnp(k,k1) <= cutoff ) then
+            kcutoff(k) = k1-k+1
+            !if ( 0.5*(p_w(k)+p_w(k+1)) <= 15000.0 ) then ! 15000.0Pa
+            !   kcutoff(k) = min(kcutoff(k), 1.0)
+            !end if
+         end if
+      end do
+   end do
+
+   ! smoothing
+   !smooth_kcutoff = .false.
+   smooth_kcutoff = .true.
+   if ( smooth_kcutoff ) then
+      allocate(kcutoff_tmp(1:kz))
+      kcutoff_tmp(:) = kcutoff(:)
+      do i = 1, 2 ! two passes
+         do k = 2, kz-1
+            kcutoff_tmp(k)  = kcutoff(k)+ 0.25 * ( kcutoff(k-1) + kcutoff(k+1) -2.0*kcutoff(k) )
+         end do
+         kcutoff(:) = kcutoff_tmp(:)
+      end do
+      deallocate(kcutoff_tmp)
+   end if
+
+   deallocate(dlnp)
+   deallocate(ln_p_w)
+   deallocate(p_w)
+
+   ! specify probability densities:
+   rho(:,:) = 1.0  ! initialize
+   do k = 1, kz
+      kscale = kcutoff(k)
+      kscale_invsq = 1.0 / ( kscale * kscale )
+      do k1 = k, kz
+         kdist = k1 - k
+         rho(k,k1) = exp ( -1.0 * real(kdist * kdist) * kscale_invsq )
+         rho(k1,k) = rho(k,k1)
+      end do
+   end do
+
+end subroutine get_vertloc
+
 subroutine compute_pert
 
    implicit none
@@ -2641,6 +2816,12 @@ subroutine compute_bv_sl
          do b = 1, num_bins2d
             do k1 = 1, nkk
                do k2 = k1+1, nkk ! Symmetry.
+                  if ( vertloc_opt > 0 ) then
+                     bv(k2,k1,b) = bv(k2,k1,b) * vertloc_rho(k2,k1)
+                  end if
+                  !if ( k1 > 40 ) then ! arbitrarily remove negative values near top
+                  !   bv(k2,k1,b) = max(0.0, bv(k2,k1,b))
+                  !end if
                   bv(k1,k2,b) = bv(k2,k1,b)
                end do
             end do

From ab7828a51c101ca870267046001e334ffe56c917 Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Wed, 6 May 2020 15:08:39 -0600
Subject: [PATCH 03/13] Bug fix in gen_be_v3 for be_method=ENS with multiple
 valid dates

modified:   var/gen_be_v3/gen_be_v3.F90
---
 var/gen_be_v3/gen_be_v3.F90 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/var/gen_be_v3/gen_be_v3.F90 b/var/gen_be_v3/gen_be_v3.F90
index c9bcf39944..892fd10a86 100644
--- a/var/gen_be_v3/gen_be_v3.F90
+++ b/var/gen_be_v3/gen_be_v3.F90
@@ -781,6 +781,9 @@ subroutine get_info(nc_list_file)
                      end if
                   else
                      icase = icase + 1
+                     iens  = 1
+                     ilist_file(iens,icase) = ifile
+                     exit find_match_loop_ens
                   end if
                end if
             end do find_match_loop_ens

From 806e605b8bab6ef0ddfbaf52769d05d7a7e60fd0 Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Thu, 7 May 2020 14:21:37 -0600
Subject: [PATCH 04/13] Add gen_be_v3 time_lag_ens option for be_method=ENS

modified:   var/gen_be_v3/gen_be_v3.F90
---
 var/gen_be_v3/gen_be_v3.F90 | 56 ++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 17 deletions(-)

diff --git a/var/gen_be_v3/gen_be_v3.F90 b/var/gen_be_v3/gen_be_v3.F90
index 892fd10a86..7ed81f17ce 100644
--- a/var/gen_be_v3/gen_be_v3.F90
+++ b/var/gen_be_v3/gen_be_v3.F90
@@ -57,7 +57,7 @@ program gen_be_v3
       do_pert_calc, do_eof_transform, do_slen_calc, slen_opt, stride, yr_cutoff, &
       verbose, level_start, level_end, aux_file, write_be_dat_r8, pert1_read_opt, &
       vertloc_opt
-   namelist /ens_nml/ nens, write_ep, write_ens_stdv, ep_format, nproc_out
+   namelist /ens_nml/ nens, write_ep, write_ens_stdv, ep_format, nproc_out, time_lag_ens
 
    character(len=filename_len) :: nc_list_file  ! the text file that contains a list of netcdf files to process
    character (len=3)           :: be_method     ! "NMC" or "ENS"
@@ -79,6 +79,7 @@ program gen_be_v3
                                      !  2: all members in one file (full-domain) (real*4)
                                      !  3: all members in one file (sub-domain), need to specify nproc_out (real*4)
    integer :: nproc_out              ! number of processors to decompose for ep_format=3
+   logical :: time_lag_ens           ! if calculating statistics for time lagged ensembles
 
    integer :: level_start, level_end ! for do_slen_calc=true. Can be set to be other than 1 and nvert for quick debugging/testing
    integer :: pert1_read_opt         ! how to access pert1 data for compute_bv_sl
@@ -214,6 +215,7 @@ program gen_be_v3
    write_be_dat_r8 = .true.
    pert1_read_opt = 1
    vertloc_opt = 0
+   time_lag_ens = .false.
 
    ! initialize internal options
    write_pert0 = .false.
@@ -280,6 +282,8 @@ program gen_be_v3
    if ( trim(be_method) == 'NMC' ) then
       ! hard-coded nens=2 for NMC application
       nens = 2
+      ! make sure time_lag_ens is false for NMC method
+      time_lag_ens = .false.
    else if ( trim(be_method) == 'ENS' ) then
       if ( nens == 0 ) then
          call error_handler(-1, &
@@ -617,7 +621,11 @@ subroutine get_info(nc_list_file)
       close(iunit)
       allocate (file_init_dates(nfile))
       allocate (file_valid_dates(nfile))
-      allocate (ilist_file(nens,nfile))
+      if ( .not. time_lag_ens ) then
+         allocate (ilist_file(nens,nfile))
+      else
+         allocate (ilist_file(nfile,nfile))
+      end if
       allocate (valid(nfile))
       file_init_dates(:)  = '0000-00-00_00:00:00'
       file_valid_dates(:) = '0000-00-00_00:00:00'
@@ -769,29 +777,43 @@ subroutine get_info(nc_list_file)
          if ( icount == 1 ) then
             iens = 1
             icase = 1
+            ilist_file(iens,icase) = ifile
          else if ( icount > 1 ) then
-            find_match_loop_ens: do ii = ifile-1, 1, -1
-               if ( valid(ii) ) then
-                  if ( file_init_dates(ifile) == file_init_dates(ii) ) then
-                     if ( file_valid_dates(ifile) == file_valid_dates(ii) ) then
-                        iens = iens + 1
-                        ilist_file(iens,icase)   = ifile
-                        if ( ii == 1 ) ilist_file(iens-1,icase) = ii
+            if ( .not. time_lag_ens ) then
+               find_match_loop_ens: do ii = ifile-1, 1, -1
+                  if ( valid(ii) ) then
+                     if ( file_init_dates(ifile) == file_init_dates(ii) ) then
+                        if ( file_valid_dates(ifile) == file_valid_dates(ii) ) then
+                           iens = iens + 1
+                           ilist_file(iens,icase) = ifile
+                           exit find_match_loop_ens
+                        end if
+                     else
+                        icase = icase + 1
+                        iens  = 1
+                        ilist_file(iens,icase) = ifile
                         exit find_match_loop_ens
-                     end if
-                  else
-                     icase = icase + 1
-                     iens  = 1
-                     ilist_file(iens,icase) = ifile
-                     exit find_match_loop_ens
-                  end if
+                     end if ! if same init_date
+                  end if ! valid file
+               end do find_match_loop_ens
+            else ! time_lag_ens
+               ! only check_valid_date for time_lag_ens
+               if ( file_valid_dates(ifile) == file_valid_dates(ilist_file(iens,icase)) ) then
+                  iens = iens + 1
+                  ilist_file(iens,icase) = ifile
                end if
-            end do find_match_loop_ens
+            end if ! time_lag_ens
          end if
       end if
    end do file_loop1
 
    ncase = icase
+   if ( time_lag_ens ) then
+      nens = iens
+      if ( myproc == root ) then
+         write(stdout,'(a,i4,a)') ' --- Resetting nens = ', nens, ' for time_lag_ens ---'
+      end if
+   end if
 
    if ( ncase > 0 .and. nens > 0 ) then
       if ( .not. allocated(filenames) ) allocate(filenames(1:nens,1:ncase))

From 668914368ff0cae0a452837a2283e7d75ea8a626 Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Sun, 24 May 2020 19:10:25 -0600
Subject: [PATCH 05/13] A few fixes to gen_be_v3

1. allow the code to compile in either real8 or real4
2. trap improper number of processors

modified:   var/gen_be_v3/gen_be_v3.F90
---
 var/gen_be_v3/gen_be_v3.F90 | 45 ++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/var/gen_be_v3/gen_be_v3.F90 b/var/gen_be_v3/gen_be_v3.F90
index 7ed81f17ce..9acd20fa0d 100644
--- a/var/gen_be_v3/gen_be_v3.F90
+++ b/var/gen_be_v3/gen_be_v3.F90
@@ -309,6 +309,16 @@ program gen_be_v3
 
    if ( myproc == root ) then
       write(stdout,*) 'ncase, nens = ', ncase, nens
+      if ( trim(be_method) == 'ENS' ) then
+         if ( num_procs > nens ) then
+            call error_handler(-1, 'num_procs must be smaller or equal to nens')
+         end if
+      end if
+      if ( trim(be_method) == 'NMC' ) then
+         if ( num_procs > ncase ) then
+            call error_handler(-1, 'num_procs must be smaller or equal to ncase')
+         end if
+      end if
       if ( verbose ) then
          if ( ncase > 0 .and. nens > 0 ) then
             do i = 1, ncase
@@ -410,10 +420,16 @@ program gen_be_v3
    end if
 
    if ( do_slen_calc ) then
+#ifdef DM_PARALLEL
+      call mpi_barrier(mpi_comm_world,ierr)
+#endif
       call compute_bv_sl
    end if
 
    if ( do_slen_calc ) then
+#ifdef DM_PARALLEL
+      call mpi_barrier(mpi_comm_world,ierr)
+#endif
       allocate(sl_smth(nk))
       if ( write_be_dat_r8 ) then
          allocate(r8tmp1d(1:nk))
@@ -1180,6 +1196,7 @@ subroutine compute_pert
    integer :: case_istart, case_iend
    integer :: ivs, ive, ie_indx
    integer :: my_nvar, dest_proc
+   integer :: this_mpi_real
 
    real(r_single), allocatable :: xfield(:,:,:)
    real(r_single), allocatable :: xfield_u(:,:,:)
@@ -2030,6 +2047,14 @@ subroutine compute_pert
             end if
             allocate (locbuf(ni, nj, nk, nens))
 
+#ifdef DM_PARALLEL
+            if ( kind(locbuf) == 4 ) then
+               this_mpi_real = mpi_real
+            else
+               this_mpi_real = mpi_real8
+            end if
+#endif
+
             my_nvar = 0
             var_loop3: do iv = 1, nvar
                if ( .not. do_this_var(iv) ) cycle var_loop3
@@ -2044,7 +2069,7 @@ subroutine compute_pert
 
 #ifdef DM_PARALLEL
                call mpi_reduce(locbuf,globuf,ijk*nens, &
-                               mpi_real,mpi_sum,root,mpi_comm_world,ierr)
+                               this_mpi_real,mpi_sum,root,mpi_comm_world,ierr)
                if ( ierr /= 0 ) then
                   write(stdout, '(a, i3)') 'Error mpi_reduce on proc', myproc
                   call mpi_abort(mpi_comm_world,1,ierr)
@@ -2278,7 +2303,7 @@ subroutine compute_regcoeff_unbalanced
 
    integer :: ic, ie, iv
    integer :: istart_member, iend_member
-   integer :: iunit, ounit
+   integer :: ounit
 
    logical :: got_var2_inv
 
@@ -2286,7 +2311,6 @@ subroutine compute_regcoeff_unbalanced
       write(stdout,'(a)')' ====== Computing regcoeff and unbalanced fields ======'
    end if
 
-   iunit = 23
    ounit = 24
 
    allocate( psi(1:ni,1:nj,1:nk) )
@@ -2699,6 +2723,7 @@ subroutine compute_bv_sl
    integer :: nn
    integer :: nvar_read, ni_read, nj_read, nk_read, nkk
    integer :: k_start, k_end
+   integer :: this_mpi_real
    integer, allocatable:: nr(:), icount(:)
    real, allocatable   :: cov(:,:)
    real, allocatable   :: ml(:), sl(:)
@@ -2714,6 +2739,14 @@ subroutine compute_bv_sl
    inv_nij = 1.0 / real(ni*nj)
    allocate( field(1:ni,1:nj,1:nk) )
 
+#ifdef DM_PARALLEL
+   if ( kind(field) == 4 ) then
+      this_mpi_real = mpi_real
+   else
+      this_mpi_real = mpi_real8
+   end if
+#endif
+
    if ( trim(be_method) == 'NMC' ) then
       istart_member = 1
       iend_member   = 1
@@ -2981,7 +3014,7 @@ subroutine compute_bv_sl
                if ( myproc == MOD((iv-1), num_procs) ) then
                   field(:,:,:) = xdata(iv,ie,ic)%value(:,:,:)
                end if
-               call mpi_bcast(field(:,:,:), ijk, mpi_real, proc_send, mpi_comm_world, ierr )
+               call mpi_bcast(field(:,:,:), ijk, this_mpi_real, proc_send, mpi_comm_world, ierr )
                if ( myproc /= MOD((iv-1), num_procs) ) then
                   xdata(iv,ie,ic)%value(:,:,:) = field(:,:,:)
                end if
@@ -3142,12 +3175,12 @@ subroutine compute_bv_sl
                end if
                sl(k) = hl(1) !hcl-num_bins2d=1
                sl(k) = sl(k) / ds ! convert to grid unit
-               write(stdout,'(a,a,i3,f10.3)') 'ScaleLength: ', varnames(iv), k, sl(k)*ds*0.001
+               write(stdout,'(a,a,i3,f10.3,a)') 'ScaleLength: ', varnames(iv), k, sl(k)*ds*0.001, ' km'
                be_data(iv)%scale_length(k) = sl(k)
             end do k_loop_opt2
 #ifdef DM_PARALLEL
             call mpi_allreduce(sl(:),sl_g(:), nk, &
-                               mpi_real, mpi_sum, mpi_comm_world, ierr)
+                               this_mpi_real, mpi_sum, mpi_comm_world, ierr)
 #else
             sl_g(:) = sl(:)
 #endif

From 45bdb8c2760a94ac31ff5b00e9f1d1dca34d5a74 Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Mon, 25 May 2020 10:25:53 -0600
Subject: [PATCH 06/13] speed up gen_be_v3 by distributing some calculation
 among processors

1. move processor index calculation out of subroutine compute_pert
2. distribute the calculation of unbalanced fields and projecting to vertical modes

modified:   var/gen_be_v3/gen_be_v3.F90
---
 var/gen_be_v3/gen_be_v3.F90 | 245 ++++++++++++++++++++++++++----------
 1 file changed, 179 insertions(+), 66 deletions(-)

diff --git a/var/gen_be_v3/gen_be_v3.F90 b/var/gen_be_v3/gen_be_v3.F90
index 9acd20fa0d..43e559565a 100644
--- a/var/gen_be_v3/gen_be_v3.F90
+++ b/var/gen_be_v3/gen_be_v3.F90
@@ -101,6 +101,11 @@ program gen_be_v3
    character(len=VarNameLen) :: vartmp
    integer :: nvar_all
 
+   integer, allocatable :: istart_ens(:), iend_ens(:)
+   integer, allocatable :: istart_case(:), iend_case(:)
+   integer :: ens_istart, ens_iend
+   integer :: case_istart, case_iend
+
    logical, allocatable :: do_this_var(:)
    integer, allocatable :: var_dim(:)
 
@@ -335,6 +340,33 @@ program gen_be_v3
    ni1 = ni + 1
    nj1 = nj + 1
 
+   ! divide nens among available processors
+   allocate (istart_ens(0:num_procs-1))
+   allocate (iend_ens  (0:num_procs-1))
+   do i = 0, num_procs - 1
+      call para_range(1, nens, num_procs, i, istart_ens(i), iend_ens(i))
+   end do
+
+   ! divide ncase among available processors
+   allocate (istart_case(0:num_procs-1))
+   allocate (iend_case  (0:num_procs-1))
+   do i = 0, num_procs - 1
+      call para_range(1, ncase, num_procs, i, istart_case(i), iend_case(i))
+   end do
+
+   ! loop indices for distributing pert calulcation among processors
+   if ( trim(be_method) == 'ENS' ) then
+      case_istart = 1
+      case_iend   = ncase
+      ens_istart  = istart_ens(myproc)
+      ens_iend    = iend_ens(myproc)
+   else if ( trim(be_method) == 'NMC' ) then
+      case_istart = istart_case(myproc)
+      case_iend   = iend_case(myproc)
+      ens_istart  = 1
+      ens_iend    = 2
+   end if
+
    if ( calc_psi .or. calc_chi ) then
       allocate(mapfac_m(ni,nj))
       allocate(mapfac_u(ni1,nj))
@@ -573,6 +605,11 @@ program gen_be_v3
       if ( allocated(vertloc_kcutoff) ) deallocate(vertloc_kcutoff)
    end if
 
+   deallocate (istart_ens)
+   deallocate (iend_ens  )
+   deallocate (istart_case)
+   deallocate (iend_case  )
+
 if ( myproc == root ) write(stdout,'(a)')' All Done!'
 
 #ifdef DM_PARALLEL
@@ -1187,13 +1224,8 @@ subroutine compute_pert
    character(len=DateStrLen)       :: DateStr
    character(len=512)              :: message
 
-   integer, allocatable :: istart_ens(:), iend_ens(:)
-   integer, allocatable :: istart_case(:), iend_case(:)
-   integer, allocatable :: istart_var(:), iend_var(:)
    integer, allocatable :: iproc_var(:)
 
-   integer :: ens_istart, ens_iend
-   integer :: case_istart, case_iend
    integer :: ivs, ive, ie_indx
    integer :: my_nvar, dest_proc
    integer :: this_mpi_real
@@ -1255,32 +1287,6 @@ subroutine compute_pert
    nk1 = nk + 1
    ijk = ni * nj * nk
 
-   ! divide nens among available processors
-   allocate (istart_ens(0:num_procs-1))
-   allocate (iend_ens  (0:num_procs-1))
-   do i = 0, num_procs - 1
-      call para_range(1, nens, num_procs, i, istart_ens(i), iend_ens(i))
-   end do
-
-   ! divide ncase among available processors
-   allocate (istart_case(0:num_procs-1))
-   allocate (iend_case  (0:num_procs-1))
-   do i = 0, num_procs - 1
-      call para_range(1, ncase, num_procs, i, istart_case(i), iend_case(i))
-   end do
-
-   if ( trim(be_method) == 'ENS' ) then
-      case_istart = 1
-      case_iend   = ncase
-      ens_istart  = istart_ens(myproc)
-      ens_iend    = iend_ens(myproc)
-   else if ( trim(be_method) == 'NMC' ) then
-      case_istart = istart_case(myproc)
-      case_iend   = iend_case(myproc)
-      ens_istart  = 1
-      ens_iend    = nens
-   end if
-
    write(stdout,'(a,i4,a,i4,a,i4)') &
       ' Proc ', myproc, ' will read  ens files ', ens_istart, ' - ', ens_iend
    write(stdout,'(a,i4,a,i4,a,i4)') &
@@ -2244,11 +2250,6 @@ subroutine compute_pert
       end if
    end if ! be_method=NMC
 
-   deallocate (istart_ens)
-   deallocate (iend_ens  )
-   deallocate (istart_case)
-   deallocate (iend_case  )
-
    do ic = case_istart, case_iend
       do ie = ens_istart, ens_iend
          do iv = 1, nvar_all
@@ -2302,8 +2303,11 @@ subroutine compute_regcoeff_unbalanced
    real, allocatable   :: regcoeff3(:,:,:)           ! psi/T regression cooefficient
 
    integer :: ic, ie, iv
-   integer :: istart_member, iend_member
+   integer :: istart_member4cov, iend_member4cov  ! index for covariance loop
+   integer :: istart_member4bal, iend_member4bal  ! index for unbalanced loop
    integer :: ounit
+   integer :: this_mpi_real
+   integer :: proc_send
 
    logical :: got_var2_inv
 
@@ -2318,20 +2322,40 @@ subroutine compute_regcoeff_unbalanced
    allocate( bin_pts2d(1:num_bins2d) )
 
    if ( trim(be_method) == 'NMC' ) then
-      istart_member = 1
-      iend_member   = 1
+      istart_member4cov = 1
+      iend_member4cov   = 1
+      istart_member4bal = 1
+      iend_member4bal   = 1
    else if ( trim(be_method) == 'ENS' ) then
-      istart_member = 1
-      iend_member   = nens
+      istart_member4cov = 1
+      iend_member4cov   = nens
+      istart_member4bal = istart_ens(myproc)
+      iend_member4bal   = iend_ens(myproc)
+   end if
+
+   allocate( regcoeff1(1:num_bins) )
+   allocate( regcoeff2(1:nk,1:num_bins2d) )
+   allocate( regcoeff3(1:nk,1:nk,1:num_bins2d) )
+   regcoeff1 = 0.0
+   regcoeff2 = 0.0
+   regcoeff3 = 0.0
+
+#ifdef DM_PARALLEL
+   if ( kind(regcoeff1) == 4 ) then
+      this_mpi_real = mpi_real
+   else
+      this_mpi_real = mpi_real8
    end if
+#endif
 
-   var_loop: do iv = 1, nvar
+   ! nvar distributed among processors
+   var_loop_regcoef: do iv = 1, nvar
 
-      if ( .not. do_this_var(iv) ) cycle var_loop
-      if ( myproc /= MOD((iv-1), num_procs) ) cycle var_loop
+      if ( .not. do_this_var(iv) ) cycle var_loop_regcoef
+      if ( myproc /= MOD((iv-1), num_procs) ) cycle var_loop_regcoef
       if ( trim(varnames(iv)) /= 'CHI_U'  .and. &
            trim(varnames(iv)) /= 'PSFC_U' .and. &
-           trim(varnames(iv)) /= 'T_U'         ) cycle var_loop
+           trim(varnames(iv)) /= 'T_U'         ) cycle var_loop_regcoef
 
       !write(stdout,'(a,i3,2a)') ' Proc', myproc, ' Computing regcoeff for variable ', trim(varnames(iv))
 
@@ -2342,6 +2366,7 @@ subroutine compute_regcoeff_unbalanced
          if ( .not. allocated(chi) )    allocate( chi(1:ni,1:nj,1:nk) )
          if ( .not. allocated(covar1) ) allocate( covar1(1:num_bins) )
          if ( .not. allocated(var1) )   allocate( var1(1:num_bins) )
+         chi(:,:,:) = 0.0
          covar1(:) = 0.0
          var1(:) = 0.0
       end if
@@ -2354,11 +2379,13 @@ subroutine compute_regcoeff_unbalanced
             ! this is needed to make get_pert1_data work for ps
             if ( .not. allocated(ps) )     allocate( ps(1:ni,1:nj,1:nk) )
             if ( .not. allocated(covar2) ) allocate( covar2(1:nk,1:num_bins2d) )
+            ps(:,:,:) = 0.0
             covar2(:,:) = 0.0
          end if
          if ( trim(varnames(iv)) == 'T_U' ) then
             if ( .not. allocated(temp) )   allocate( temp(1:ni,1:nj,1:nk) )
             if ( .not. allocated(covar3) ) allocate( covar3(1:nk,1:nk,1:num_bins2d) )
+            temp(:,:,:) = 0.0
             covar3(:,:,:) = 0.0
          end if
       end if
@@ -2366,7 +2393,7 @@ subroutine compute_regcoeff_unbalanced
       write(stdout,'(a,i3,2a)') ' Proc', myproc, ' Computing psi covariances for variable ', trim(varnames(iv))
 
       do ic = 1, ncase
-         do ie = istart_member, iend_member
+         do ie = istart_member4cov, iend_member4cov
             !write(stdout,'(5a,i4)')'    Processing data for date ', filedates(ie,ic), ', variable ', trim(varnames(iv)), &
             !                  ', member ', ie
 
@@ -2519,7 +2546,6 @@ subroutine compute_regcoeff_unbalanced
       write(stdout,'(a,i3,2a)') ' Proc', myproc, ' Computing regression coefficients for variable ', trim(varnames(iv))
 
       if ( trim(varnames(iv)) == 'CHI_U' ) then
-         allocate( regcoeff1(1:num_bins) )
          ! psi/chi:
          do b = 1, num_bins
             regcoeff1(b) = covar1(b) / var1(b)
@@ -2534,7 +2560,6 @@ subroutine compute_regcoeff_unbalanced
       end if ! chi_u
 
       if ( trim(varnames(iv)) == 'PSFC_U' ) then
-         allocate( regcoeff2(1:nk,1:num_bins2d) )
          ! psi/ps:
          do b = 1, num_bins2d
             do k = 1, nk
@@ -2555,7 +2580,6 @@ subroutine compute_regcoeff_unbalanced
       end if ! ps_u
 
       if ( trim(varnames(iv)) == 'T_U' ) then
-         allocate( regcoeff3(1:nk,1:nk,1:num_bins2d) )
          ! psi/T:
          do b = 1, num_bins2d
             do k = 1, nk
@@ -2597,10 +2621,52 @@ subroutine compute_regcoeff_unbalanced
          deallocate( covar3 )
       end if
 
-      write(stdout,'(a,i3,2a)') ' Proc', myproc, ' Computing unbalanced field for variable ', trim(varnames(iv))
+   end do var_loop_regcoef
 
-      do ic = 1, ncase
-         do ie = istart_member, iend_member
+#ifdef DM_PARALLEL
+      do iv = 1, nvar
+         if ( .not. do_this_var(iv) ) cycle
+         proc_send = MOD((iv-1), num_procs)
+         if ( trim(varnames(iv)) == 'CHI_U' ) then
+            call mpi_bcast(regcoeff1(:), num_bins, this_mpi_real, proc_send, mpi_comm_world, ierr )
+         end if
+         if ( trim(varnames(iv)) == 'PSFC_U' ) then
+            call mpi_bcast(regcoeff2(:,:), nk*num_bins2d, this_mpi_real, proc_send, mpi_comm_world, ierr )
+         end if
+         if ( trim(varnames(iv)) == 'T_U' ) then
+            call mpi_bcast(regcoeff3(:,:,:), nk*nk*num_bins2d, this_mpi_real, proc_send, mpi_comm_world, ierr )
+         end if
+      end do
+#endif
+
+   var_loop_unbal: do iv = 1, nvar
+
+      if ( .not. do_this_var(iv) ) cycle var_loop_unbal
+      if ( trim(varnames(iv)) /= 'CHI_U'  .and. &
+           trim(varnames(iv)) /= 'PSFC_U' .and. &
+           trim(varnames(iv)) /= 'T_U'         ) cycle var_loop_unbal
+
+      if ( myproc == root ) write(stdout,'(2a)') ' Computing unbalanced field for variable ', trim(varnames(iv))
+
+      if ( trim(varnames(iv)) == 'CHI_U' ) then
+         if ( .not. allocated(chi) )   allocate( chi(1:ni,1:nj,1:nk) )
+         chi(:,:,:) = 0.0
+      end if
+      if ( trim(varnames(iv)) == 'PSFC_U' ) then
+         if ( .not. allocated(ps) )    allocate( ps(1:ni,1:nj,1:nk) )
+         ps(:,:,:) = 0.0
+      end if
+      if ( trim(varnames(iv)) == 'T_U' ) then
+         if ( .not. allocated(temp) )  allocate( temp(1:ni,1:nj,1:nk) )
+         temp(:,:,:) = 0.0
+      end if
+
+      ! ncase/nens distributed among processors
+      do ic = case_istart, case_iend
+         do ie = istart_member4bal, iend_member4bal
+
+            !write(stdout,'(a,i3,4a)') ' Proc', myproc, ' Computing unbalanced field for variable ', trim(varnames(iv)), &
+            !                          ' date ', filedates(ie,ic)
 
             ! psi is needed for all variables
             write(ce,'(i3.3)') ie
@@ -2665,6 +2731,10 @@ subroutine compute_regcoeff_unbalanced
          end do ! ens member loop
       end do ! ncase loop
 
+#ifdef DM_PARALLEL
+      call mpi_barrier(mpi_comm_world,ierr)
+#endif
+
       if ( trim(varnames(iv)) == 'CHI_U' ) then
          if ( allocated(chi) ) deallocate( chi )
       end if
@@ -2675,8 +2745,11 @@ subroutine compute_regcoeff_unbalanced
          if ( allocated(temp) ) deallocate( temp )
       end if
 
-   end do var_loop
+   end do var_loop_unbal
 
+   deallocate( regcoeff1 )
+   deallocate( regcoeff2 )
+   deallocate( regcoeff3 )
    deallocate( psi )
    deallocate( bin_pts )
    deallocate( bin_pts2d )
@@ -2692,7 +2765,8 @@ subroutine compute_bv_sl
    character(len=filename_len)  :: output_file
    integer             :: i, j, k, k1, k2, b         ! Loop counters.
    integer             :: ic, ie, iv, m
-   integer :: istart_member, iend_member
+   integer :: istart_member, iend_member             ! loop index for nens
+   integer :: istart_member_p, iend_member_p         ! loop index for projecting to modes
    real                :: inv_nij                    ! 1 / (ni*nj).
    real                :: mean_field                 ! Mean field.
    real                :: coeffa, coeffb             ! Accumulating mean coefficients.
@@ -2748,11 +2822,15 @@ subroutine compute_bv_sl
 #endif
 
    if ( trim(be_method) == 'NMC' ) then
-      istart_member = 1
-      iend_member   = 1
+      istart_member   = 1
+      iend_member     = 1
+      istart_member_p = 1
+      iend_member_p   = 1
    else if ( trim(be_method) == 'ENS' ) then
-      istart_member = 1
-      iend_member   = nens
+      istart_member   = 1
+      iend_member     = nens
+      istart_member_p = istart_ens(myproc)
+      iend_member_p   = iend_ens(myproc)
    end if
 
    if ( pert1_read_opt == 1 ) then
@@ -2807,10 +2885,11 @@ subroutine compute_bv_sl
          allocate( fieldv(1:ni,1:nj,1:nk) )
       end if
 
-      var_loop: do iv = 1, nvar
+      ! nvar distributed among processors
+      var_loop_bv: do iv = 1, nvar
 
-         if ( .not. do_this_var(iv) ) cycle var_loop
-         if ( myproc /= MOD((iv-1), num_procs) ) cycle var_loop
+         if ( .not. do_this_var(iv) ) cycle var_loop_bv
+         if ( myproc /= MOD((iv-1), num_procs) ) cycle var_loop_bv
 
          write(stdout,'(a,i3,2a)') ' Proc', myproc, ' Processing vertical error stats for variable ', trim(varnames(iv))
 
@@ -2953,10 +3032,40 @@ subroutine compute_bv_sl
             end do
          end do
 
+         !  Output eigenvectors, eigenvalues
+         filename = 'EV_'//trim(varnames(iv))//'.dat'
+         open (ounit, file = filename, form='unformatted')
+         write(ounit)varnames(iv)
+         write(ounit)ni, nj, nkk
+         write(ounit)evec(1:nj,1:nkk,1:nkk)
+         write(ounit)eval(1:nj,1:nkk)
+         close(ounit)
+      end do var_loop_bv
+
+#ifdef DM_PARALLEL
+      call mpi_barrier(mpi_comm_world,ierr)
+#endif
+
+      var_loop_proj: do iv = 1, nvar
+
+         if ( .not. do_this_var(iv) ) cycle var_loop_proj
+
          if ( var_dim(iv) == 3 ) then
+
+            ! raed eigenvectors, eigenvalues
+            filename = 'EV_'//trim(varnames(iv))//'.dat'
+            open (iunit, file = filename, form='unformatted')
+            read(iunit) !varnames(iv)
+            read(iunit) !ni, nj, nkk
+            read(iunit) evec(1:nj,1:nk,1:nk)
+            read(iunit) eval(1:nj,1:nk)
+            close(iunit)
+
             if ( myproc == root ) write(stdout,'(a)')' Projecting fields onto vertical modes'
-            do ic = 1, ncase
-               do ie = istart_member, iend_member
+
+            ! ncase/nens distributed among processors
+            do ic = case_istart, case_iend
+               do ie = istart_member_p, iend_member_p
                   !write(stdout,'(5a,i4)')'    Date = ', filedates(ie,ic), ', variable ', trim(varnames(iv)), &
                   !                  ', member ', ie
                   ! Project fields onto vertical modes
@@ -2999,9 +3108,13 @@ subroutine compute_bv_sl
                   close(ounit)
                 end if ! pert1_read_opt
                end do ! End loop over members.
-            end do
-         end if
-      end do var_loop
+            end do ! case loop
+         end if ! var_dim=3
+      end do var_loop_proj
+
+#ifdef DM_PARALLEL
+      call mpi_barrier(mpi_comm_world,ierr)
+#endif
 
       if ( pert1_read_opt == 1 ) then
 #ifdef DM_PARALLEL

From 7bf86c3850b944a11c964fed2754e58acc5e0575 Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Wed, 3 Jun 2020 12:08:17 -0600
Subject: [PATCH 07/13] fixed a couple gen_be_v3 issues revealed by compiling
 in debug mode

1. psi should be interpolated to be on mass grids in order to have consistent dimensions as other variables
2. split checks for do_this_var(1:nvar) when nvar_all > nvar

modified:   var/gen_be_v3/gen_be_v3.F90
---
 var/gen_be_v3/gen_be_v3.F90 | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/var/gen_be_v3/gen_be_v3.F90 b/var/gen_be_v3/gen_be_v3.F90
index 43e559565a..a1e384826a 100644
--- a/var/gen_be_v3/gen_be_v3.F90
+++ b/var/gen_be_v3/gen_be_v3.F90
@@ -1237,7 +1237,8 @@ subroutine compute_pert
 
    real, allocatable :: vor(:,:,:)    ! vorticity
    real, allocatable :: div(:,:,:)    ! divergence
-   real, allocatable :: psi(:,:,:)    ! stream function
+   real, allocatable :: psi_s(:,:,:)  ! stream function
+   real, allocatable :: psi(:,:,:)    ! stream function on mass grid
    real, allocatable :: chi(:,:,:)    ! velocity potential
    real, allocatable :: ustag(:,:,:)  ! u on staggered grid
    real, allocatable :: vstag(:,:,:)  ! v on staggered grid
@@ -1626,13 +1627,25 @@ subroutine compute_pert
 
          if ( calc_psi .and. got_u .and. got_v ) then
             if ( .not. allocated(vor) ) allocate (vor(ni+1, nj+1, nk))
-            if ( .not. allocated(psi) ) allocate (psi(ni+1, nj+1, nk))
+            if ( .not. allocated(psi_s) ) allocate (psi_s(ni+1, nj+1, nk))
+            if ( .not. allocated(psi) )   allocate (psi(ni, nj, nk))
             ! Calculate vorticity (in center of mass grid on WRF's Arakawa C-grid)
             call da_uv_to_vor_c(ni, nj, nk, ds, &
                                 mapfac_m, mapfac_u, mapfac_v, ustag, vstag, vor)
             ! Calculate streamfunction
             ! Assumes vor converted to Del**2 psi
-            call da_del2a_to_a(ni+1, nj+1, nk, ds, vor, psi)
+            call da_del2a_to_a(ni+1, nj+1, nk, ds, vor, psi_s)
+
+            ! interpolate psi to mass points
+            do k = 1, nk
+               do j = 1, nj
+                  do i = 1, ni
+                     psi(i,j,k) = 0.25 * ( psi_s(i,j,k) + psi_s(i+1,j,k) +     &
+                                           psi_s(i,j+1,k) + psi_s(i+1,j+1,k) )
+                  end do
+               end do
+            end do
+            if ( allocated(psi_s) ) deallocate (psi_s)
          end if ! calc_psi
 
          if ( calc_chi .and. got_u .and. got_v ) then
@@ -1874,7 +1887,9 @@ subroutine compute_pert
          if ( myproc == root ) write(stdout,'(a)') ' ====== Computing ensemble mean ======'
 
          var_loop2: do iv = 1, nvar_all
-            if ( (iv<=nvar) .and. (.not. do_this_var(iv)) ) cycle var_loop2
+            if ( iv <= nvar ) then
+               if ( .not. do_this_var(iv) ) cycle var_loop2
+            end if
             ens_sum_loc(:,:,:) = 0.0 ! initialize for each variable
             ens_sum    (:,:,:) = 0.0 ! initialize for each variable
             do n = istart_ens(myproc),iend_ens(myproc)
@@ -1906,7 +1921,9 @@ subroutine compute_pert
 
          if ( myproc == root ) write(stdout,'(a)') ' ====== Computing ensemble perturbations ======'
          do iv = 1, nvar_all
-            if ( (iv<=nvar) .and. (.not. do_this_var(iv)) ) cycle
+            if ( iv <= nvar ) then
+               if ( .not. do_this_var(iv) ) cycle
+            end if
             do ie = ens_istart, ens_iend ! each proc loops over a subset of ens
                if ( remove_ens_mean ) then
                   xdata(iv,ie,ic)%value(:,:,:) = xdata(iv,ie,ic)%value(:,:,:) - ens_mean(:,:,:,iv)

From 82c2df6b3e6bd3cb9f0df9d5d1166617e9a46675 Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Tue, 21 Jul 2020 16:20:32 -0600
Subject: [PATCH 08/13] Fix precision declaration for regcoeff in
 combine_be_cv5.f90

modified:   var/gen_be_v3/util/combine_be_cv5.f90
---
 var/gen_be_v3/util/combine_be_cv5.f90 | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/var/gen_be_v3/util/combine_be_cv5.f90 b/var/gen_be_v3/util/combine_be_cv5.f90
index 75d9710190..503fedbb5c 100644
--- a/var/gen_be_v3/util/combine_be_cv5.f90
+++ b/var/gen_be_v3/util/combine_be_cv5.f90
@@ -21,12 +21,18 @@ program be_readwrite
    !real,    allocatable  :: e_vec_loc(:,:,:)     ! latitudinally varying eigenvectors
    !real,    allocatable  :: e_val_loc(:,:)       ! latitudinally varying eigenvalues
    !real,    allocatable  :: scale_length(:)      ! scale length for regional application
+   !real,    allocatable  :: regcoeff1(:)         ! psi/chi regcoeff
+   !real,    allocatable  :: regcoeff2(:,:)       ! psi/ps regcoeff
+   !real,    allocatable  :: regcoeff3(:,:,:)     ! psi/t regcoeff
 !for be_[PSI|CHI_U|T_U|RH|PSFC_U].dat in real(8) when gen_be_v3 is run with write_be_dat_r8=.true.(default)
    real*8,  allocatable  :: e_vec(:,:)           ! domain-averaged eigenvectors
    real*8,  allocatable  :: e_val(:)             ! domain-averaged eigenvalues
    real*8,  allocatable  :: e_vec_loc(:,:,:)     ! latitudinally varying eigenvectors
    real*8,  allocatable  :: e_val_loc(:,:)       ! latitudinally varying eigenvalues
    real*8,  allocatable  :: scale_length(:)      ! scale length for regional application
+   real*8,  allocatable  :: regcoeff1(:)         ! psi/chi regcoeff
+   real*8,  allocatable  :: regcoeff2(:,:)       ! psi/ps regcoeff
+   real*8,  allocatable  :: regcoeff3(:,:,:)     ! psi/t regcoeff
 
    real*8,  allocatable  :: e_vec_r8(:,:)        ! domain-averaged eigenvectors
    real*8,  allocatable  :: e_val_r8(:)          ! domain-averaged eigenvalues
@@ -34,9 +40,6 @@ program be_readwrite
    real*8,  allocatable  :: e_val_loc_r8(:,:)    ! latitudinally varying eigenvalues
    real*8,  allocatable  :: scale_length_r8(:)   ! scale length for regional application
 
-   real,    allocatable  :: regcoeff1(:)         ! psi/chi regcoeff
-   real,    allocatable  :: regcoeff2(:,:)       ! psi/ps regcoeff
-   real,    allocatable  :: regcoeff3(:,:,:)     ! psi/t regcoeff
    real*8,  allocatable  :: regcoeff1_r8(:)      ! psi/chi regcoeff
    real*8,  allocatable  :: regcoeff2_r8(:,:)    ! psi/ps regcoeff
    real*8,  allocatable  :: regcoeff3_r8(:,:,:)  ! psi/t regcoeff

From a02a872181d795c48737fc67883d6540281f06ba Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Sun, 23 Aug 2020 16:56:03 -0600
Subject: [PATCH 09/13] Add an option (es_ice) to consider ice effect for
 saturation vapor pressure.

modified:   var/gen_be_v3/gen_be_v3.F90
---
 var/gen_be_v3/gen_be_v3.F90 | 80 ++++++++++++++++++++++++++++++++++---
 1 file changed, 75 insertions(+), 5 deletions(-)

diff --git a/var/gen_be_v3/gen_be_v3.F90 b/var/gen_be_v3/gen_be_v3.F90
index a1e384826a..e8ee4e7a6e 100644
--- a/var/gen_be_v3/gen_be_v3.F90
+++ b/var/gen_be_v3/gen_be_v3.F90
@@ -49,6 +49,7 @@ program gen_be_v3
    real,    parameter :: cp = 7.0*gas_constant/2.0
    real,    parameter :: kappa = gas_constant/cp
    real,    parameter :: t_kelvin = 273.15
+   real,    parameter :: t_triple = 273.16 ! triple point of water
    real,    parameter :: gas_constant_v = 461.6
    real,    parameter :: rd_over_rv = gas_constant/gas_constant_v
    real,    parameter :: rd_over_rv1 = 1.0 - rd_over_rv
@@ -56,7 +57,7 @@ program gen_be_v3
    namelist /gen_be_nml/ nc_list_file, be_method, varnames, outnames, &
       do_pert_calc, do_eof_transform, do_slen_calc, slen_opt, stride, yr_cutoff, &
       verbose, level_start, level_end, aux_file, write_be_dat_r8, pert1_read_opt, &
-      vertloc_opt
+      vertloc_opt, es_ice
    namelist /ens_nml/ nens, write_ep, write_ens_stdv, ep_format, nproc_out, time_lag_ens
 
    character(len=filename_len) :: nc_list_file  ! the text file that contains a list of netcdf files to process
@@ -71,6 +72,7 @@ program gen_be_v3
    integer :: stride                 ! for slen_opt=1 to skip every stride grid point
    real    :: yr_cutoff(nvar_max)    ! for slen_opt=1
    logical :: verbose
+   logical :: es_ice                 ! whether to consider ice effect or not for saturation vapor pressure
    integer :: nens                   ! set in namelist for ENS method. hard-coded to 2 for NMC method.
    logical :: write_ep               ! if writing out ensemble perturbations
    logical :: write_ens_stdv         ! if writing out stdv of ensemble perturbations
@@ -216,6 +218,7 @@ program gen_be_v3
    ep_format = 2
    nproc_out = 1
    verbose = .false.
+   es_ice = .false.
    aux_file = 'none'
    write_be_dat_r8 = .true.
    pert1_read_opt = 1
@@ -1252,7 +1255,7 @@ subroutine compute_pert
    real, allocatable :: mub(:,:)
    real, allocatable :: znw(:)
    real    :: p_top
-   real    :: tc, es, qs
+   real    :: tk, es, qs
    logical :: got_prs, got_th, got_qv, got_u, got_v, got_psfc
 
    real, allocatable :: time_mean(:,:,:,:)
@@ -1756,9 +1759,11 @@ subroutine compute_pert
                do k = 1, nk
                   do j = 1, nj
                      do i = 1, ni
-                        tc = (theta(i,j,k)*(prs(i,j,k)/p00)**kappa) - t_kelvin
-                        es = 611.2 * exp( 17.67 * tc / (tc + 243.5) )
-                        qs = rd_over_rv * es /( prs(i,j,k) - rd_over_rv1 * es)
+                        !tc = (theta(i,j,k)*(prs(i,j,k)/p00)**kappa) - t_kelvin
+                        !es = 611.2 * exp( 17.67 * tc / (tc + 243.5) )
+                        !qs = rd_over_rv * es /( prs(i,j,k) - rd_over_rv1 * es)
+                        tk = theta(i,j,k)*(prs(i,j,k)/p00)**kappa
+                        call calc_es_qs(tk, prs(i,j,k), es, qs, es_ice)
                         ! get RH is ratio, not percentage
                         xdata(iv,ie,ic)%value(i,j,k) = (q(i,j,k)/(1.0+q(i,j,k))) / qs
                      end do
@@ -3924,6 +3929,71 @@ subroutine get_pert1_data(pert_file, vname, nx, ny, nz, vdata)
 
 end subroutine get_pert1_data
 
+subroutine calc_es_qs (t, p, es, qs, wrt_ice)
+
+! Purpose: calculate saturation vapor pressure and saturation specific humidity
+!          given temperature and pressure
+
+   implicit none
+
+   real, intent(in)  :: t, p
+   real, intent(out) :: es, qs
+   logical, intent(in), optional :: wrt_ice
+
+   real, parameter   :: es_alpha = 611.2
+   real, parameter   :: es_beta  = 17.67
+   real, parameter   :: es_gamma = 243.5
+   real, parameter   :: t_c_ref1 =   0.0   ! C
+   real, parameter   :: t_c_ref2 = -20.0   ! C
+   real, parameter   :: a0 = 6.107799961
+   real, parameter   :: a1 = 4.436518521e-01
+   real, parameter   :: a2 = 1.428945805e-02
+   real, parameter   :: a3 = 2.650648471e-04
+   real, parameter   :: a4 = 3.031240396e-06
+   real, parameter   :: a5 = 2.034080948e-08
+   real, parameter   :: a6 = 6.136820929e-11
+   real, parameter   :: c1 = 9.09718
+   real, parameter   :: c2 = 3.56654
+   real, parameter   :: c3 = 0.876793
+   real, parameter   :: c4 = 6.1071
+   real              :: t_c, t1_c          ! T in degree C
+   logical           :: ice
+
+   ice = .false.
+   if ( present(wrt_ice) ) then
+      if ( wrt_ice ) ice = .true.
+   end if
+
+   t_c  = t - t_kelvin
+   t1_c = t - t_triple
+
+   ! Calculate saturation vapor pressure es
+
+   if ( .not. ice ) then   ! over water only
+
+      es = es_alpha * exp( es_beta * t_c / ( t_c + es_gamma ) )
+
+   else   ! consider ice-water and ice effects
+
+      if ( t1_c > t_c_ref1 ) then   ! vapor pressure over water
+         es = es_alpha * exp( es_beta * t_c / ( t_c + es_gamma ) )
+      else if ( (t1_c <= t_c_ref1) .and. (t1_c >= t_c_ref2) ) then   ! vapor pressure over water below 0C
+         es = a0 + t1_c * (a1 + t1_c * (a2 + t1_c * (a3 + t1_c * (a4 + t1_c * (a5 + t1_c * a6)))))
+         es = es * 100.0  ! to Pa
+      else   ! vapor pressure over ice
+         es = 10.0 ** ( -c1 * (t_triple / t - 1.0) - c2 * alog10(t_triple / t) +  &
+                         c3 * (1.0 - t / t_triple) +      alog10(c4))
+         es = es * 100.0  ! to Pa
+      end if
+
+   end if
+
+   ! Calculate saturation specific humidity qs
+
+   qs = rd_over_rv * es / ( p - rd_over_rv1 * es )
+
+end subroutine calc_es_qs
+
 end program gen_be_v3
 
 subroutine para_range(n1, n2, nprocs, myrank, ista, iend)

From a85a14c128f13a07791146f3d359737ef44d847c Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Sun, 23 Aug 2020 17:28:51 -0600
Subject: [PATCH 10/13] Add makefile and use lapack lib compiled in WRFDA

modified:   var/gen_be_v3/README.gen_be_v3
deleted:    var/gen_be_v3/compile_casper
deleted:    var/gen_be_v3/compile_cheyenne
modified:   var/gen_be_v3/gen_be_v3.F90
new file:   var/gen_be_v3/makefile
---
 var/gen_be_v3/README.gen_be_v3 | 12 +++++++++-
 var/gen_be_v3/compile_casper   | 44 ----------------------------------
 var/gen_be_v3/compile_cheyenne | 44 ----------------------------------
 var/gen_be_v3/gen_be_v3.F90    |  4 +++-
 var/gen_be_v3/makefile         | 40 +++++++++++++++++++++++++++++++
 5 files changed, 54 insertions(+), 90 deletions(-)
 delete mode 100755 var/gen_be_v3/compile_casper
 delete mode 100755 var/gen_be_v3/compile_cheyenne
 create mode 100644 var/gen_be_v3/makefile

diff --git a/var/gen_be_v3/README.gen_be_v3 b/var/gen_be_v3/README.gen_be_v3
index e552e263f8..53e14fd50e 100644
--- a/var/gen_be_v3/README.gen_be_v3
+++ b/var/gen_be_v3/README.gen_be_v3
@@ -4,7 +4,17 @@
 
 1. To compile:
 
-      See the sample compile_cheyenne/compile_casper and the info in gen_be_v3.F90
+   Option 1: use lapack lib compiled in WRFDA
+      (1) need to compile WRFDA first,
+            cd your_WRFDA_dir
+            ./clean -a
+            ./configure wrfda
+            ./compile all_wrfvar
+      (2) cd your_WRFDA_dir/var/gen_be_v3
+          edit makefile
+          make
+
+   Option 2: use general lapack lib (see the desciption in gen_be_v3.F90)
 
 2. To run:
 
diff --git a/var/gen_be_v3/compile_casper b/var/gen_be_v3/compile_casper
deleted file mode 100755
index 1c698bfa9b..0000000000
--- a/var/gen_be_v3/compile_casper
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/csh
-#set echo
-
-set MACHINE = `hostname -s`
-
-if ( `echo $MACHINE|cut -c1-6` == casper ) then #NCAR casper DAV
-
-   # WRFIO_LIB = WRF/external/io_netcdf/libwrfio_nf.a
-   set WRFIO_LIB = /gpfs/u/home/hclin/extlib/intel/libwrfio_nf.a
-   if ( ! $?WRFIO_LIB ) then
-      echo "environment variable WRFIO_LIB not set"
-      exit 1
-   endif
-
-   # LAPACK library for EOF decomposition
-   set LAPACK_LIB = /glade/u/apps/opt/intel/2017u1/mkl/lib/intel64_lin/libmkl_lapack95_lp64.a
-   if ( ! $?LAPACK_LIB ) then
-      echo "environment variable LAPACK_LIB not set"
-      exit 1
-   endif
-
-   module purge
-   module load ncarenv
-   module load intel
-   module load ncarcompilers
-   module load netcdf
-   module load mkl
-   module load openmpi
-   module list
-
-set echo
-
-   #MPI/OMP
-   cpp -P -traditional -DDM_PARALLEL gen_be_v3.F90 > be.f90
-   mpif90 be.f90 -o gen_be_v3_mpp.exe -qopenmp -convert big_endian ${WRFIO_LIB} -L${NETCDF}/lib -lnetcdf -lnetcdff -lm -I${NETCDF}/include ${LAPACK_LIB}
-
-   #serial/OMP
-   cpp -P -traditional gen_be_v3.F90 > be.f90
-   ifort be.f90 -o gen_be_v3.exe -qopenmp -convert big_endian ${WRFIO_LIB} -L${NETCDF}/lib -lnetcdf -lnetcdff -lm -I${NETCDF}/include ${LAPACK_LIB}
-
-unset echo
-
-   \rm -f be.f90
-endif
diff --git a/var/gen_be_v3/compile_cheyenne b/var/gen_be_v3/compile_cheyenne
deleted file mode 100755
index bed06ab65d..0000000000
--- a/var/gen_be_v3/compile_cheyenne
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/csh
-#set echo
-
-set MACHINE = `hostname -s`
-
-if ( `echo $MACHINE|cut -c1-8` == cheyenne ) then #NCAR HPC
-
-   # WRFIO_LIB = WRF/external/io_netcdf/libwrfio_nf.a
-   set WRFIO_LIB = /gpfs/u/home/hclin/extlib/intel/libwrfio_nf.a
-   if ( ! $?WRFIO_LIB ) then
-      echo "environment variable WRFIO_LIB not set"
-      exit 1
-   endif
-
-   # LAPACK library for EOF decomposition
-   set LAPACK_LIB = /glade/u/apps/opt/intel/2017u1/compilers_and_libraries/linux/mkl/lib/intel64_lin/libmkl_lapack95_lp64.a
-   if ( ! $?LAPACK_LIB ) then
-      echo "environment variable LAPACK_LIB not set"
-      exit 1
-   endif
-
-   module purge
-   module load ncarenv
-   module load intel
-   module load ncarcompilers
-   module load netcdf
-   module load mkl
-   module load mpt
-   module list
-
-set echo
-
-   #MPI/OMP
-   cpp -P -traditional -DDM_PARALLEL gen_be_v3.F90 > be.f90
-   mpif90 be.f90 -o gen_be_v3_mpp.exe -qopenmp -convert big_endian ${WRFIO_LIB} -L${NETCDF}/lib -lnetcdf -lnetcdff -lm -I${NETCDF}/include ${LAPACK_LIB}
-
-   #serial/OMP
-   cpp -P -traditional gen_be_v3.F90 > be.f90
-   ifort be.f90 -o gen_be_v3.exe -qopenmp -convert big_endian ${WRFIO_LIB} -L${NETCDF}/lib -lnetcdf -lnetcdff -lm -I${NETCDF}/include ${LAPACK_LIB}
-
-unset echo
-
-   \rm -f be.f90
-endif
diff --git a/var/gen_be_v3/gen_be_v3.F90 b/var/gen_be_v3/gen_be_v3.F90
index e8ee4e7a6e..b81db79090 100644
--- a/var/gen_be_v3/gen_be_v3.F90
+++ b/var/gen_be_v3/gen_be_v3.F90
@@ -2780,7 +2780,6 @@ end subroutine compute_regcoeff_unbalanced
 
 subroutine compute_bv_sl
 
-   !use da_lapack, only : dsyev
    implicit none
 
    character(len=filename_len)  :: filename                   ! Input filename.
@@ -3637,6 +3636,9 @@ subroutine da_eof_decomposition (kz, bx, e, l)
    !          BE field.
    !---------------------------------------------------------------------------
 
+#ifdef USE_WRFDA
+   use da_lapack, only : dsyev
+#endif
    implicit none
 
    integer, intent(in)  :: kz               ! Dimension of error matrix.
diff --git a/var/gen_be_v3/makefile b/var/gen_be_v3/makefile
new file mode 100644
index 0000000000..8c48cc0ff6
--- /dev/null
+++ b/var/gen_be_v3/makefile
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+#serial
+FC = gfortran
+CPPFLAG = -DUSE_WRFDA
+
+#mpp
+#FC = mpif90
+#CPPFLAG = -DUSE_WRFDA -DDM_PARALLEL
+
+#openmp (optional, used only for slen_opt=1)
+OMP_FLAG = #-fopenmp
+OMP_LIB  = #-L/usr/local/lib -lgfortran -lgomp
+
+PROMOTION  = #-fdefault-real-8
+FCBASEOPTS = -ffree-form -fconvert=big-endian #-ffree-line-length-none
+FCDEBUG    = #-g -O0 -fbacktrace -ggdb -fcheck=bounds,do,mem,pointer -ffpe-trap=invalid,zero,overflow
+FFLAGS     = $(PROMOTION) $(FCDEBUG) $(FCBASEOPTS) $(OMP_FLAG)
+CPP        = cpp -P -traditional
+
+WRF_SRC_ROOT_DIR = ../..
+LIBS = -L$(WRF_SRC_ROOT_DIR)/external/io_netcdf -lwrfio_nf -L$(WRF_SRC_ROOT_DIR)/var/build -lwrfvar -L$(NETCDF)/lib -lnetcdff -lnetcdf
+INCS = -I$(WRF_SRC_ROOT_DIR)/var/build
+
+OBJS = gen_be_v3.o
+
+gen_be_v3: ${OBJS}
+	${FC} -o gen_be_v3.exe ${FFLAGS} ${OBJS} ${LIBS} $(OMP_LIB)
+
+.SUFFIXES : .F90 .f .o
+
+.F90.f :
+	$(RM) $@
+	$(CPP) $(CPPFLAG) $*.F90 > $@
+
+.f.o :
+	${FC} ${FFLAGS} ${INCS} -c $*.f
+
+clean:
+	rm -f *.o *.exe

From d367c2d726edf204395796beabc69c7e403fc834 Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Mon, 24 Aug 2020 14:13:20 -0600
Subject: [PATCH 11/13] Add makefile for gen_be_v3/util programs

new file:   var/gen_be_v3/util/makefile
---
 var/gen_be_v3/util/makefile | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 var/gen_be_v3/util/makefile

diff --git a/var/gen_be_v3/util/makefile b/var/gen_be_v3/util/makefile
new file mode 100644
index 0000000000..37eb30dc94
--- /dev/null
+++ b/var/gen_be_v3/util/makefile
@@ -0,0 +1,26 @@
+#!/bin/sh
+
+FC     = gfortran
+FFLAGS = -fconvert=big-endian
+
+all: combine_be_cv5.exe combine_be_cv7.exe combine_be_cv7_ccv2.exe
+
+combine_be_cv5.exe: combine_be_cv5.o
+	rm -f $@
+	${FC} -o $@ $(FFLAGS) combine_be_cv5.o
+
+combine_be_cv7.exe: combine_be_cv7.o
+	rm -f $@
+	${FC} -o $@ $(FFLAGS) combine_be_cv7.o
+
+combine_be_cv7_ccv2.exe: combine_be_cv7_ccv2.o
+	rm -f $@
+	${FC} -o $@ $(FFLAGS) combine_be_cv7_ccv2.o
+
+.SUFFIXES : .f90 .o
+
+.f90.o :
+	$(FC) $(FFLAGS) -c $*.f90
+
+clean:
+	rm -f *.o *.exe

From 755c227c678aa97d36110fc0d66de061c3491712 Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Mon, 24 Aug 2020 14:40:36 -0600
Subject: [PATCH 12/13] Change sample gen_be_v3 makefile to real-8 compilation

modified:   var/gen_be_v3/makefile
---
 var/gen_be_v3/makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/var/gen_be_v3/makefile b/var/gen_be_v3/makefile
index 8c48cc0ff6..5f185bf549 100644
--- a/var/gen_be_v3/makefile
+++ b/var/gen_be_v3/makefile
@@ -12,7 +12,8 @@ CPPFLAG = -DUSE_WRFDA
 OMP_FLAG = #-fopenmp
 OMP_LIB  = #-L/usr/local/lib -lgfortran -lgomp
 
-PROMOTION  = #-fdefault-real-8
+#real-4 also works, but real-8 is recommended if memory is not an issue
+PROMOTION  = -fdefault-real-8
 FCBASEOPTS = -ffree-form -fconvert=big-endian #-ffree-line-length-none
 FCDEBUG    = #-g -O0 -fbacktrace -ggdb -fcheck=bounds,do,mem,pointer -ffpe-trap=invalid,zero,overflow
 FFLAGS     = $(PROMOTION) $(FCDEBUG) $(FCBASEOPTS) $(OMP_FLAG)

From c43d3361057d35c2b19dd283b5fbfe05c0c66c06 Mon Sep 17 00:00:00 2001
From: Jamie Bresch <hclin@ucar.edu>
Date: Mon, 24 Aug 2020 17:28:08 -0600
Subject: [PATCH 13/13] Change sample makefile back to real-4 and add some
 notes about it

modified:   var/gen_be_v3/README.gen_be_v3
modified:   var/gen_be_v3/makefile
---
 var/gen_be_v3/README.gen_be_v3 | 5 +++++
 var/gen_be_v3/makefile         | 5 +++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/var/gen_be_v3/README.gen_be_v3 b/var/gen_be_v3/README.gen_be_v3
index 53e14fd50e..9121c16331 100644
--- a/var/gen_be_v3/README.gen_be_v3
+++ b/var/gen_be_v3/README.gen_be_v3
@@ -16,6 +16,11 @@
 
    Option 2: use general lapack lib (see the desciption in gen_be_v3.F90)
 
+ Note about precision:
+   For write_ep = .true. application, the code should be compiled with real-4.
+   For gen_be applications, especially cv_options=5, real-4 works, but real-8 is recommended if memory is not an issue.
+   Hopefully the gen_be_v3 code will be updated in the future to better handle the variable declaration.
+
 2. To run:
 
       (1) Edit namelist.gen_be_v3 (examples are shown below)
diff --git a/var/gen_be_v3/makefile b/var/gen_be_v3/makefile
index 5f185bf549..2a19536f49 100644
--- a/var/gen_be_v3/makefile
+++ b/var/gen_be_v3/makefile
@@ -12,8 +12,9 @@ CPPFLAG = -DUSE_WRFDA
 OMP_FLAG = #-fopenmp
 OMP_LIB  = #-L/usr/local/lib -lgfortran -lgomp
 
-#real-4 also works, but real-8 is recommended if memory is not an issue
-PROMOTION  = -fdefault-real-8
+#for write_ep = .true. application, the code should be compiled with real-4
+#for gen_be applications, especially cv_options=5, real-4 works, but real-8 is recommended if memory is not an issue
+PROMOTION  = #-fdefault-real-8
 FCBASEOPTS = -ffree-form -fconvert=big-endian #-ffree-line-length-none
 FCDEBUG    = #-g -O0 -fbacktrace -ggdb -fcheck=bounds,do,mem,pointer -ffpe-trap=invalid,zero,overflow
 FFLAGS     = $(PROMOTION) $(FCDEBUG) $(FCBASEOPTS) $(OMP_FLAG)