From 529b6ea6b47624731d72610c388171b778dd3786 Mon Sep 17 00:00:00 2001 From: Ming Hu Date: Tue, 7 May 2024 13:25:38 -0400 Subject: [PATCH 01/12] Bring read_radar bug fix (PR 738) to RRFS release branch. (#744) This PR is to bring read_radar bug fix (https://github.com/NOAA-EMC/GSI/pull/738) committed in the develop branch to RRFS release branch. --- src/gsi/read_radar.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gsi/read_radar.f90 b/src/gsi/read_radar.f90 index a824bbbe4..84a4f4fbc 100644 --- a/src/gsi/read_radar.f90 +++ b/src/gsi/read_radar.f90 @@ -907,6 +907,7 @@ subroutine read_radar(nread,ndata,nodata,infile,lunout,obstype,twind,sis,hgtl_fu end do superobs close(lnbufr) ! A simple unformatted fortran file should not be mixed with a bufr I/O + nread=nsuper2_kept LEVEL_TWO_READ_2: if(loop==0 .and. sis=='l2rw') then write(6,*)'READ_RADAR: ',trim(outmessage),' reached eof on 2/2.5/3 superob radar file' @@ -2176,7 +2177,6 @@ subroutine read_radar(nread,ndata,nodata,infile,lunout,obstype,twind,sis,hgtl_fu ibadstaheight=0 notgood=0 notgood0=0 - nread=0 ntdrvr_in=0 ntdrvr_kept=0 ntdrvr_thin1=0 @@ -2522,7 +2522,7 @@ subroutine read_radar(nread,ndata,nodata,infile,lunout,obstype,twind,sis,hgtl_fu end do ! end of loop, reading TDR so data files close(lnbufr) - else + else if (trim(infile) == 'tldplrbufr' ) then nswptype=0 nmrecs=0 From d0facd72fe15661b563b1438ae3ea1f8b009a409 Mon Sep 17 00:00:00 2001 From: Shun Liu Date: Mon, 2 Dec 2024 21:08:24 +0000 Subject: [PATCH 02/12] add GDIT's performance optimizaiton for RRFS --- modulefiles/gsi_wcoss2.intel.lua | 6 +- src/gsi/CMakeLists.txt | 2 + src/gsi/cplr_get_fv3_regional_ensperts.f90 | 47 +- src/gsi/get_gefs_for_regional.f90 | 13 + src/gsi/gsi_files.cmake | 2 + src/gsi/mod_fv3_lola.f90 | 975 +++++++++++++-------- src/gsi/pcgsoi.f90 | 12 +- 7 files changed, 685 insertions(+), 372 deletions(-) diff --git a/modulefiles/gsi_wcoss2.intel.lua b/modulefiles/gsi_wcoss2.intel.lua index c3bfd1156..116baedb9 100644 --- a/modulefiles/gsi_wcoss2.intel.lua +++ b/modulefiles/gsi_wcoss2.intel.lua @@ -9,7 +9,9 @@ local cmake_ver= os.getenv("cmake_ver") or "3.20.2" local python_ver=os.getenv("python_ver") or "3.8.6" local prod_util_ver=os.getenv("prod_util_ver") or "2.0.10" -local netcdf_ver=os.getenv("netcdf_ver") or "4.7.4" +local zlib_ver=os.getenv("zlib_ver") or "1.2.11" +local hdf5_ver=os.getenv("hdf5_ver") or "1.14.0" +local netcdf_ver=os.getenv("netcdf_ver") or "4.9.2" local bufr_ver=os.getenv("bufr_ver") or "11.7.0" local bacio_ver=os.getenv("bacio_ver") or "2.4.1" local w3emc_ver=os.getenv("w3emc_ver") or "2.9.2" @@ -32,6 +34,8 @@ load(pathJoin("python", python_ver)) load(pathJoin("prod_util", prod_util_ver)) +load(pathJoin("zlib", zlib_ver)) +load(pathJoin("hdf5", hdf5_ver)) load(pathJoin("netcdf", netcdf_ver)) load(pathJoin("bufr", bufr_ver)) load(pathJoin("bacio", bacio_ver)) diff --git a/src/gsi/CMakeLists.txt b/src/gsi/CMakeLists.txt index f894b0a8a..d7a02ed99 100644 --- a/src/gsi/CMakeLists.txt +++ b/src/gsi/CMakeLists.txt @@ -62,6 +62,7 @@ find_package(NetCDF REQUIRED Fortran) if(OPENMP) find_package(OpenMP REQUIRED) endif() +find_package(ZLIB REQUIRED) # NCEPLibs dependencies find_package(bacio REQUIRED) @@ -157,6 +158,7 @@ target_link_libraries(gsi_fortran_obj PUBLIC w3emc::w3emc_d) target_link_libraries(gsi_fortran_obj PUBLIC sp::sp_d) target_link_libraries(gsi_fortran_obj PUBLIC bufr::bufr_d) target_link_libraries(gsi_fortran_obj PUBLIC crtm::crtm) +target_link_libraries(gsi_fortran_obj PUBLIC ZLIB::ZLIB) if(GSI_MODE MATCHES "Regional") target_link_libraries(gsi_fortran_obj PUBLIC wrf_io::wrf_io) endif() diff --git a/src/gsi/cplr_get_fv3_regional_ensperts.f90 b/src/gsi/cplr_get_fv3_regional_ensperts.f90 index 9b841f012..55aa41029 100644 --- a/src/gsi/cplr_get_fv3_regional_ensperts.f90 +++ b/src/gsi/cplr_get_fv3_regional_ensperts.f90 @@ -76,6 +76,7 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) use netcdf_mod , only: nc_check use gsi_rfv3io_mod, only: fv3lam_io_phymetvars3d_nouv use obsmod, only: if_model_dbz,if_model_fed + use mpi, only : MPI_Wtime, MPI_REAL8, MPI_SUCCESS, MPI_MAX implicit none @@ -119,6 +120,8 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) integer(i_kind):: imem_start,n_fv3sar integer(i_kind):: i_caseflag + real(kind=8) :: time_beg,time_end,walltime, tb,te,wt + integer(i_kind) :: ierr if(n_ens/=(n_ens_gfs+n_ens_fv3sar)) then write(6,*)'wrong, the sum of n_ens_gfs and n_ens_fv3sar not equal n_ens, stop' @@ -275,7 +278,7 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) end if end if - + tb=MPI_Wtime() do m=1,ntlevs_ens @@ -452,7 +455,8 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) if( .not. parallelization_over_ensmembers )then if (mype == 0) write(6,'(a,a)') & 'CALL READ_FV3_REGIONAL_ENSPERTS FOR ENS DATA with the filename str : ',trim(ensfilenam_str) - + + time_beg=MPI_Wtime() select case (i_caseflag) case (0) call this%general_read_fv3_regional(fv3_filename,ps,u,v,tv,rh,oz) @@ -469,9 +473,15 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) call this%general_read_fv3_regional(fv3_filename,ps,u,v,tv,rh,oz, & g_ql=ql,g_qi=qi,g_qr=qr,g_qs=qs,g_qg=qg,g_qnr=qnr,g_w=w,g_fed=fed) case (5) + !write(6,'("get_fv3_regional_ensperts_run: Before general_read_fv3_regional")') call this%general_read_fv3_regional(fv3_filename,ps,u,v,tv,rh,oz, & g_ql=ql,g_qi=qi,g_qr=qr,g_qs=qs,g_qg=qg,g_qnr=qnr,g_w=w,g_dbz=dbz,g_fed=fed) end select + time_end=MPI_Wtime() + call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("Maximum Walltime for general_read_fv3_regional" f15.4,I4)') walltime,i_caseflag + end if if( parallelization_over_ensmembers )then @@ -792,6 +802,11 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) end do enddo ! it 4d loop + te=MPI_Wtime() + call MPI_Reduce(te-tb, wt, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("Maximum Walltime to read ",I4," ensemble members ", f15.4)') n_ens_fv3sar,wt + ! CALCULATE ENSEMBLE SPREAD if(write_ens_sprd ) then call this%ens_spread_dualres_regional(mype,en_perts,nelen) @@ -851,7 +866,7 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g use hybrid_ensemble_parameters, only: grd_ens,q_hyb_ens use hybrid_ensemble_parameters, only: fv3sar_ensemble_opt,dual_res - use mpimod, only: mpi_comm_world,mpi_rtype + use mpimod, only: mpi_comm_world,mpi_rtype,mype use gsi_rfv3io_mod,only: type_fv3regfilenameg use gsi_rfv3io_mod,only:n2d use constants, only: half,zero @@ -870,6 +885,7 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g use directDA_radaruse_mod, only: l_use_cvpqx, cvpqx_pval, cld_nt_updt use directDA_radaruse_mod, only: l_cvpnr, cvpnr_pval use obsmod, only:if_model_dbz,if_model_fed + use mpi, only : MPI_Wtime, MPI_REAL8, MPI_MAX, MPI_SUCCESS implicit none @@ -913,6 +929,8 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g character(len=:),allocatable :: sfcdata !='fv3_sfcdata' character(len=:),allocatable :: couplerres!='coupler.res' integer (i_kind) ier,istatus + real(kind=8) :: time_beg,time_end,walltime + integer(i_kind) :: ierr associate( this => this ) ! eliminates warning for unused dummy argument needed for binding @@ -930,6 +948,7 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g couplerres=fv3_filenameginput%couplerres + !write(6,'("general_read_fv3_regional: fv3sar_ensemble_opt= " I4)') fv3sar_ensemble_opt if (allocated(fv3lam_ens_io_dynmetvars2d_nouv) ) then @@ -956,16 +975,28 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g end if - if(fv3sar_ensemble_opt == 0 ) then + if(fv3sar_ensemble_opt == 0 ) then + time_beg=MPI_Wtime() call gsi_fv3ncdf_readuv(grd_fv3lam_ens_uv,g_u,g_v,fv3_filenameginput,dual_res) + time_end=MPI_Wtime() + call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("general_read_fv3_regional: Maximum Walltime for gsi_fv3ncdf_readuv" f15.4)') walltime + else call gsi_fv3ncdf_readuv_v1(grd_fv3lam_ens_uv,g_u,g_v,fv3_filenameginput,dual_res) endif if(fv3sar_ensemble_opt == 0) then + time_beg=MPI_Wtime() call gsi_fv3ncdf_read(grd_fv3lam_ens_dynvar_io_nouv,gsibundle_fv3lam_ens_dynvar_nouv,& fv3_filenameginput%dynvars,fv3_filenameginput,dual_res) call gsi_fv3ncdf_read(grd_fv3lam_ens_tracer_io_nouv,gsibundle_fv3lam_ens_tracer_nouv,& fv3_filenameginput%tracers,fv3_filenameginput,dual_res) + time_end=MPI_Wtime() + call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("general_read_fv3_regional: Maximum Walltime for gsi_fv3ncdf_read" f15.4)') walltime + if( if_model_dbz .or. if_model_fed ) then call gsi_fv3ncdf_read(grd_fv3lam_ens_phyvar_io_nouv,gsibundle_fv3lam_ens_phyvar_nouv,& fv3_filenameginput%phyvars,fv3_filenameginput,dual_res) @@ -1013,6 +1044,8 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g endif !! tsen2tv !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + !$omp parallel do default(none),private(k,i,j) & + !$omp shared(grd_ens,g_q,g_tsen,g_tv,fv) do k=1,grd_ens%nsig do j=1,grd_ens%lon2 do i=1,grd_ens%lat2 @@ -1023,6 +1056,8 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g if (.not.q_hyb_ens) then ice=.true. iderivative=0 + !$omp parallel do default(none),private(k,i,j,kp) & + !$omp shared(grd_ens,g_prsi,g_prsl) do k=1,grd_ens%nsig kp=k+1 do j=1,grd_ens%lon2 @@ -1032,6 +1067,8 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g end do end do call genqsat(g_rh,g_tsen(1,1,1),g_prsl(1,1,1),grd_ens%lat2,grd_ens%lon2,grd_ens%nsig,ice,iderivative) + !$omp parallel do default(none),private(k,i,j) & + !$omp shared(grd_ens,g_rh,g_q) do k=1,grd_ens%nsig do j=1,grd_ens%lon2 do i=1,grd_ens%lat2 @@ -1051,6 +1088,8 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g ! CV transform + !$omp parallel do default(none),private(k,i,j) & + !$omp shared(grd_ens,l_use_cvpqx,g_qr,cvpqx_pval,g_qs,g_qg,g_qnr,cld_nt_updt,l_cvpnr,cvpnr_pval) do k=1,grd_ens%nsig do i=1,grd_ens%lon2 do j=1,grd_ens%lat2 diff --git a/src/gsi/get_gefs_for_regional.f90 b/src/gsi/get_gefs_for_regional.f90 index cc5e0a2c8..196ad2ec8 100644 --- a/src/gsi/get_gefs_for_regional.f90 +++ b/src/gsi/get_gefs_for_regional.f90 @@ -83,6 +83,7 @@ subroutine get_gefs_for_regional use get_wrf_mass_ensperts_mod, only: get_wrf_mass_ensperts_class use gsi_io, only: verbose use obsmod, only: l_wcp_cwm + use mpi, only : MPI_Wtime, MPI_REAL8, MPI_SUCCESS implicit none type(sub2grid_info) grd_gfs,grd_mix,grd_gfst @@ -187,6 +188,8 @@ subroutine get_gefs_for_regional real(r_kind), pointer :: ges_q (:,:,:)=>NULL() logical :: print_verbose real(r_kind), allocatable :: ges_z_ens(:,:) + real(kind=8) :: time_beg,time_end,walltime, tb,te,wt + integer(i_kind) :: ierr print_verbose=.false. if(verbose)print_verbose=.true. @@ -621,6 +624,7 @@ subroutine get_gefs_for_regional rewind(10) inithead=.true. + tb=MPI_Wtime() do n=1,n_ens_gfs read(10,'(a)',err=20,end=20)filename filename=trim(ensemble_path) // trim(filename) @@ -641,8 +645,13 @@ subroutine get_gefs_for_regional call general_read_gfsatm_nems(grd_gfst,sp_gfs,filename,uv_hyb_ens,.false.,.true., & atm_bundle,.true.,iret) else if (use_gfs_ncio) then + time_beg=MPI_Wtime() call general_read_gfsatm_nc(grd_gfst,sp_gfs,filename,uv_hyb_ens,.false.,.true., & atm_bundle,.true.,iret) + time_end=MPI_Wtime() + call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("Maximum Walltime for general_read_gfsatm_nc" f15.4)') walltime else call general_read_gfsatm(grd_gfst,sp_gfs,sp_gfs,filename,uv_hyb_ens,.false.,.true., & atm_bundle,inithead,iret) @@ -955,6 +964,10 @@ subroutine get_gefs_for_regional ! if(mype==0) write(6,*)' with halo, n,min,max ges_ps - matt ps =',n,pdiffmin0,pdiffmax0 end do ! end loop over ensemble members. + te=MPI_Wtime() + call MPI_Reduce(te-tb, wt, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("Maximum Walltime to read ",I4," ensemble members ", f15.4)') n_ens_gfs,wt deallocate(ges_z_ens) diff --git a/src/gsi/gsi_files.cmake b/src/gsi/gsi_files.cmake index 95d885e2e..b7fb9e623 100644 --- a/src/gsi/gsi_files.cmake +++ b/src/gsi/gsi_files.cmake @@ -1,5 +1,6 @@ list(APPEND GSI_SRC_C blockIO.c +crc32_c.c ) # Class files for WRF interfaces @@ -264,6 +265,7 @@ guess_grids.F90 half_nmm_grid2.f90 hdraobmod.f90 hilbert_curve.f90 +crc32.F90 hybrid_ensemble_isotropic.F90 hybrid_ensemble_parameters.f90 inc2guess.f90 diff --git a/src/gsi/mod_fv3_lola.f90 b/src/gsi/mod_fv3_lola.f90 index 11bb3b6e3..b01be9fbb 100644 --- a/src/gsi/mod_fv3_lola.f90 +++ b/src/gsi/mod_fv3_lola.f90 @@ -128,8 +128,12 @@ subroutine generate_anl_grid(nx,ny,grid_lon,grid_lont,grid_lat,grid_latt) use gridmod, only:grid_ratio_fv3_regional, region_lat,region_lon,nlat,nlon use gridmod, only: region_dy,region_dx,region_dyi,region_dxi,coeffy,coeffx use gridmod, only:init_general_transform,region_dy,region_dx - use mpimod, only: mype + use mpimod, only: mype,mpi_rtype use egrid2agrid_mod, only: egrid2agrid_parm + use mpi + use crc32 + use ifcore + use ifport implicit none real(r_kind),allocatable,dimension(:)::xbh_a,xa_a,xa_b @@ -164,431 +168,641 @@ subroutine generate_anl_grid(nx,ny,grid_lon,grid_lont,grid_lat,grid_latt) real(r_kind) d(4),ds integer(i_kind) kk,k + integer(i_kind) name_len,nodeID,nodeComm,nodeRank,RanksPerNode,ierr,npes,inunit + character(len=72) :: input + character(len=5) :: np,nlatc,nlonc + logical :: res,exists + character(len=MPI_MAX_PROCESSOR_NAME) :: nodeName + !real(kind=8) :: time_beg,time_end,walltime + nord_e2a=4 bilinear=.false. + call MPI_Comm_size(mpi_comm_world,npes,ierr) + call MPI_Get_processor_name(nodeName,name_len,ierr) + nodeID=digest(trim(nodeName)) + !read(nodeName(4:9),*) nodeID ! Danger! This approach will not to work on platforms other than WCOSS2 + call MPI_Comm_split(mpi_comm_world, nodeID, mype, nodeComm, ierr) + call MPI_Comm_rank(nodeComm,nodeRank,ierr) + call MPI_Comm_size(nodeComm,RanksPerNode,ierr) + + ! Check for existing file + write(nlatc, '(i0)') nx; nlatc = adjustl(nlatc) + write(nlonc, '(i0)') ny; nlonc = adjustl(nlonc) + write(np, '(i0)') npes; np = adjustl(np) + input= 'anl_grid.'//trim(np)//'.'//trim(nlatc)//'.'//trim(nlonc) + inquire (file=trim(input), EXIST=exists) + + if(nodeRank==0) then + if (exists) then + !write(6,'("generate_anl_grid: Reading from ana_grid ",I4,2A)') mype,' ',trim(input) + inunit=2000+mype + open(inunit,file=trim(input),form='unformatted',action='read') + read(inunit) nlat,nlon,nxa,nya + if (allocated(region_dx )) deallocate(region_dx ) + if (allocated(region_dy )) deallocate(region_dy ) + allocate( region_dx(nlat,nlon), region_dy(nlat,nlon)) + allocate(region_dxi(nlat,nlon),region_dyi(nlat,nlon)) + allocate(coeffx(nlat,nlon),coeffy(nlat,nlon)) + if (allocated(region_lat)) deallocate(region_lat) + if (allocated(region_lon)) deallocate(region_lon) + allocate(region_lat(nlat,nlon),region_lon(nlat,nlon)) + allocate(fv3dx(nxa,nya),fv3dx1(nxa,nya),fv3dy(nxa,nya),fv3dy1(nxa,nya) ) + allocate(fv3ix(nxa,nya),fv3ixp(nxa,nya),fv3jy(nxa,nya),fv3jyp(nxa,nya) ) + allocate( a3dx( ny, nx),a3dx1(ny,nx),a3dy(ny,nx),a3dy1(ny,nx) ) + allocate( a3ix( ny, nx),a3ixp(ny,nx),a3jy(ny,nx),a3jyp(ny,nx) ) + allocate(cangu( nx,ny+1),sangu(nx,ny+1),cangv(nx+1,ny),sangv(nx+1,ny) ) + + rewind(inunit) + !time_beg=MPI_Wtime() + read(inunit) nlat,nlon,nxa,nya, & + region_dx,region_dy,region_dxi,region_dyi, & + coeffx,coeffy,region_lat,region_lon, & + fv3dx,fv3dx1,fv3dy,fv3dy1,fv3ix,fv3ixp,fv3jy,fv3jyp, & + a3dx,a3dx1,a3dy,a3dy1,a3ix,a3ixp,a3jy,a3jyp, & + cangu,sangu,cangv,sangv + !time_end=MPI_Wtime() + !call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, nodeComm, ierr) + !if(mype==0) write(6,'("Maximum Walltime to read anl_grid " f15.4)') walltime + + close(inunit) + else ! create xc,yc,zc for the cell centers. - allocate(xc(nx,ny)) - allocate(yc(nx,ny)) - allocate(zc(nx,ny)) - allocate(gclat(nx,ny)) - allocate(gclon(nx,ny)) - allocate(gcrlat(nx,ny)) - allocate(gcrlon(nx,ny)) - do j=1,ny - do i=1,nx - xc(i,j)=cos(grid_latt(i,j)*deg2rad)*cos(grid_lont(i,j)*deg2rad) - yc(i,j)=cos(grid_latt(i,j)*deg2rad)*sin(grid_lont(i,j)*deg2rad) - zc(i,j)=sin(grid_latt(i,j)*deg2rad) - enddo - enddo + allocate(xc(nx,ny)) + allocate(yc(nx,ny)) + allocate(zc(nx,ny)) + allocate(gclat(nx,ny)) + allocate(gclon(nx,ny)) + allocate(gcrlat(nx,ny)) + allocate(gcrlon(nx,ny)) + !$omp parallel do default(none),private(j,i) & + !$omp shared(ny,nx,xc,grid_latt,grid_lont,deg2rad,yc,zc) + do j=1,ny + do i=1,nx + xc(i,j)=cos(grid_latt(i,j)*deg2rad)*cos(grid_lont(i,j)*deg2rad) + yc(i,j)=cos(grid_latt(i,j)*deg2rad)*sin(grid_lont(i,j)*deg2rad) + zc(i,j)=sin(grid_latt(i,j)*deg2rad) + enddo + enddo + !$omp end parallel do ! compute center as average x,y,z coordinates of corners of domain -- - xcent=quarter*(xc(1,1)+xc(1,ny)+xc(nx,1)+xc(nx,ny)) - ycent=quarter*(yc(1,1)+yc(1,ny)+yc(nx,1)+yc(nx,ny)) - zcent=quarter*(zc(1,1)+zc(1,ny)+zc(nx,1)+zc(nx,ny)) + xcent=quarter*(xc(1,1)+xc(1,ny)+xc(nx,1)+xc(nx,ny)) + ycent=quarter*(yc(1,1)+yc(1,ny)+yc(nx,1)+yc(nx,ny)) + zcent=quarter*(zc(1,1)+zc(1,ny)+zc(nx,1)+zc(nx,ny)) - rnorm=one/sqrt(xcent**2+ycent**2+zcent**2) - xcent=rnorm*xcent - ycent=rnorm*ycent - zcent=rnorm*zcent - centlat=asin(zcent)*rad2deg - centlon=atan2(ycent,xcent)*rad2deg + rnorm=one/sqrt(xcent**2+ycent**2+zcent**2) + xcent=rnorm*xcent + ycent=rnorm*ycent + zcent=rnorm*zcent + centlat=asin(zcent)*rad2deg + centlon=atan2(ycent,xcent)*rad2deg !! compute new lats, lons - call rotate2deg(grid_lont,grid_latt,gcrlon,gcrlat, & - centlon,centlat,nx,ny) + call rotate2deg(grid_lont,grid_latt,gcrlon,gcrlat, & + centlon,centlat,nx,ny) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !! compute analysis A-grid lats, lons !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !--------------------------obtain analysis grid dimensions nxa,nya - nxa=1+nint((nx-one)/grid_ratio_fv3_regional) - nya=1+nint((ny-one)/grid_ratio_fv3_regional) - nlat=nya - nlon=nxa - if(mype==0) print *,'nlat,nlon=nya,nxa= ',nlat,nlon + nxa=1+nint((nx-one)/grid_ratio_fv3_regional) + nya=1+nint((ny-one)/grid_ratio_fv3_regional) + nlat=nya + nlon=nxa + if(mype==0) print *,'nlat,nlon=nya,nxa= ',nlat,nlon !--------------------------obtain analysis grid spacing - dlat=(maxval(gcrlat)-minval(gcrlat))/(ny-1) - dlon=(maxval(gcrlon)-minval(gcrlon))/(nx-1) - adlat=dlat*grid_ratio_fv3_regional - adlon=dlon*grid_ratio_fv3_regional + dlat=(maxval(gcrlat)-minval(gcrlat))/(ny-1) + dlon=(maxval(gcrlon)-minval(gcrlon))/(nx-1) + adlat=dlat*grid_ratio_fv3_regional + adlon=dlon*grid_ratio_fv3_regional !-------setup analysis A-grid; find center of the domain - nlonh=nlon/2 - nlath=nlat/2 - - if(nlonh*2==nlon)then - clon=adlon/two - cx=half - else - clon=adlon - cx=one - endif - - if(nlath*2==nlat)then - clat=adlat/two - cy=half - else - clat=adlat - cy=one - endif + nlonh=nlon/2 + nlath=nlat/2 + + if(nlonh*2==nlon)then + clon=adlon/two + cx=half + else + clon=adlon + cx=one + endif + + if(nlath*2==nlat)then + clat=adlat/two + cy=half + else + clat=adlat + cy=one + endif ! !-----setup analysis A-grid from center of the domain ! - allocate(rlat_in(nlat,nlon),rlon_in(nlat,nlon)) - do j=1,nlon - alon=(j-nlonh)*adlon-clon - do i=1,nlat - rlon_in(i,j)=alon - enddo - enddo - - - do j=1,nlon - do i=1,nlat - rlat_in(i,j)=(i-nlath)*adlat-clat - enddo - enddo - - if (allocated(region_dx )) deallocate(region_dx ) - if (allocated(region_dy )) deallocate(region_dy ) - allocate(region_dx(nlat,nlon),region_dy(nlat,nlon)) - allocate(region_dxi(nlat,nlon),region_dyi(nlat,nlon)) - allocate(coeffx(nlat,nlon),coeffy(nlat,nlon)) - dyy=rearth*adlat*deg2rad - dyyi=one/dyy - dyyh=half/dyy - do j=1,nlon - do i=1,nlat - region_dy(i,j)=dyy - region_dyi(i,j)=dyyi - coeffy(i,j)=dyyh - enddo - enddo - - do i=1,nlat - dxx=rearth*cos(rlat_in(i,1)*deg2rad)*adlon*deg2rad - dxxi=one/dxx - dxxh=half/dxx - do j=1,nlon - region_dx(i,j)=dxx - region_dxi(i,j)=dxxi - coeffx(i,j)=dxxh - enddo - enddo + allocate(rlat_in(nlat,nlon),rlon_in(nlat,nlon)) + !$omp parallel do default(none),private(j,alon,i) & + !$omp shared(nlon,nlat,rlon_in,nlonh,adlon,clon) + do j=1,nlon + alon=(j-nlonh)*adlon-clon + do i=1,nlat + rlon_in(i,j)=alon + enddo + enddo + !$omp end parallel do + + !$omp parallel do default(none),private(j,i) & + !$omp shared(nlon,nlat,rlat_in,nlath,adlat,clat) + do j=1,nlon + do i=1,nlat + rlat_in(i,j)=(i-nlath)*adlat-clat + enddo + enddo + !$omp end parallel do + + if (allocated(region_dx )) deallocate(region_dx ) + if (allocated(region_dy )) deallocate(region_dy ) + allocate(region_dx(nlat,nlon),region_dy(nlat,nlon)) + allocate(region_dxi(nlat,nlon),region_dyi(nlat,nlon)) + allocate(coeffx(nlat,nlon),coeffy(nlat,nlon)) + dyy=rearth*adlat*deg2rad + dyyi=one/dyy + dyyh=half/dyy + !$omp parallel do default(none),private(j,i) & + !$omp shared(nlon,nlat,dyy,dyyi,dyyh,region_dy,region_dyi,coeffy) + do j=1,nlon + do i=1,nlat + region_dy(i,j)=dyy + region_dyi(i,j)=dyyi + coeffy(i,j)=dyyh + enddo + enddo + !$omp end parallel do + + !$omp parallel do default(none),private(j,i,dxx,dxxi,dxxh) & + !$omp shared(nlon,nlat,rearth,rlat_in,deg2rad,adlon,region_dx,region_dxi,coeffx) + do i=1,nlat + dxx=rearth*cos(rlat_in(i,1)*deg2rad)*adlon*deg2rad + dxxi=one/dxx + dxxh=half/dxx + do j=1,nlon + region_dx(i,j)=dxx + region_dxi(i,j)=dxxi + coeffx(i,j)=dxxh + enddo + enddo + !$omp end parallel do ! !---------- setup region_lat,region_lon in earth coord ! - if (allocated(region_lat)) deallocate(region_lat) - if (allocated(region_lon)) deallocate(region_lon) - allocate(region_lat(nlat,nlon),region_lon(nlat,nlon)) - allocate(glat_an(nlon,nlat),glon_an(nlon,nlat)) + if (allocated(region_lat)) deallocate(region_lat) + if (allocated(region_lon)) deallocate(region_lon) + allocate(region_lat(nlat,nlon),region_lon(nlat,nlon)) + allocate(glat_an(nlon,nlat),glon_an(nlon,nlat)) - call unrotate2deg(region_lon,region_lat,rlon_in,rlat_in, & + call unrotate2deg(region_lon,region_lat,rlon_in,rlat_in, & centlon,centlat,nlat,nlon) - region_lat=region_lat*deg2rad - region_lon=region_lon*deg2rad - - do j=1,nlat - do i=1,nlon - glat_an(i,j)=region_lat(j,i) - glon_an(i,j)=region_lon(j,i) - enddo - enddo - - call init_general_transform(glat_an,glon_an) + region_lat=region_lat*deg2rad + region_lon=region_lon*deg2rad + !$omp parallel do default(none),private(j,i) & + !$omp shared(nlon,nlat,region_lat,region_lon,glat_an,glon_an) + do j=1,nlat + do i=1,nlon + glat_an(i,j)=region_lat(j,i) + glon_an(i,j)=region_lon(j,i) + enddo + enddo + !$omp end parallel do + + call init_general_transform(glat_an,glon_an) - deallocate(glat_an,glon_an) + deallocate(glat_an,glon_an) !--------------------compute all combinations of relative coordinates - allocate(xbh_a(nx),xbh_b(nx,ny),xa_a(nxa),xa_b(nxa)) - allocate(ybh_a(ny),ybh_b(nx,ny),ya_a(nya),ya_b(nya)) + allocate(xbh_a(nx),xbh_b(nx,ny),xa_a(nxa),xa_b(nxa)) + allocate(ybh_a(ny),ybh_b(nx,ny),ya_a(nya),ya_b(nya)) - nxh=nx/2 - nyh=ny/2 + nxh=nx/2 + nyh=ny/2 !!!!!! fv3 rotated grid; not equal spacing, non_orthogonal !!!!!! - do j=1,ny - jr=ny+1-j - do i=1,nx - ir=nx+1-i - xbh_b(ir,jr)=gcrlon(i,j)/dlon - end do - end do - do j=1,ny - jr=ny+1-j - do i=1,nx - ir=nx+1-i - ybh_b(ir,jr)=gcrlat(i,j)/dlat - end do - end do + !$omp parallel do default(none),private(j,jr,i,ir) & + !$omp shared(ny,nx,xbh_b,gcrlon,dlon) + do j=1,ny + jr=ny+1-j + do i=1,nx + ir=nx+1-i + xbh_b(ir,jr)=gcrlon(i,j)/dlon + end do + end do + !$omp end parallel do + + !$omp parallel do default(none),private(j,jr,i,ir) & + !$omp shared(ny,nx,ybh_b,gcrlat,dlat) + do j=1,ny + jr=ny+1-j + do i=1,nx + ir=nx+1-i + ybh_b(ir,jr)=gcrlat(i,j)/dlat + end do + end do + !$omp end parallel do !!!! define analysis A grid !!!!!!!!!!!!! - do j=1,nxa - xa_a(j)=(real(j-nlonh,r_kind)-cx)*grid_ratio_fv3_regional - end do - do i=1,nya - ya_a(i)=(real(i-nlath,r_kind)-cy)*grid_ratio_fv3_regional - end do + do j=1,nxa + xa_a(j)=(real(j-nlonh,r_kind)-cx)*grid_ratio_fv3_regional + end do + do i=1,nya + ya_a(i)=(real(i-nlath,r_kind)-cy)*grid_ratio_fv3_regional + end do !!!!!compute fv3 to A grid interpolation parameters !!!!!!!!! - allocate ( fv3dx(nxa,nya),fv3dx1(nxa,nya),fv3dy(nxa,nya),fv3dy1(nxa,nya) ) - allocate ( fv3ix(nxa,nya),fv3ixp(nxa,nya),fv3jy(nxa,nya),fv3jyp(nxa,nya) ) - allocate(yy(ny)) + allocate ( fv3dx(nxa,nya),fv3dx1(nxa,nya),fv3dy(nxa,nya),fv3dy1(nxa,nya) ) + allocate ( fv3ix(nxa,nya),fv3ixp(nxa,nya),fv3jy(nxa,nya),fv3jyp(nxa,nya) ) + allocate(yy(ny)) ! iteration to find the fv3 grid cell - jb1=1 - ib1=1 - do j=1,nya - do i=1,nxa - do n=1,3 - gxa=xa_a(i) - if(gxa < xbh_b(1,jb1))then - gxa= 1 - else if(gxa > xbh_b(nx,jb1))then - gxa= nx - else - call grdcrd1(gxa,xbh_b(1,jb1),nx,1) - endif - ib2=ib1 - ib1=gxa - do jj=1,ny - yy(jj)=ybh_b(ib1,jj) - enddo - gya=ya_a(j) - if(gya < yy(1))then - gya= 1 - else if(gya > yy(ny))then - gya= ny - else - call grdcrd1(gya,yy,ny,1) - endif - jb2=jb1 - jb1=gya - if(ib1+1 > nx)then !this block( 6 lines) is copied from GSL gsi repository - ib1=ib1-1 - endif - if(jb1+1 > ny)then - jb1=jb1-1 - endif - - - if((ib1 == ib2) .and. (jb1 == jb2)) exit - if(n==3 ) then + jb1=1 + ib1=1 + !$omp parallel do default(none),private(j,i,n,gxa,ib2,jj,yy,gya,jb2,d,kk,k,ds) & + !$omp shared(nya,nxa,xa_a,xbh_b,nx,ybh_b,ya_a,ny,fv3ix,fv3ixp,fv3jy,fv3jyp,fv3dy,fv3dy1,fv3dx,fv3dx1,bilinear) & + !$omp firstprivate(jb1,ib1) + do j=1,nya + do i=1,nxa + do n=1,3 + gxa=xa_a(i) + if(gxa < xbh_b(1,jb1))then + gxa= 1 + else if(gxa > xbh_b(nx,jb1))then + gxa= nx + else + call grdcrd1(gxa,xbh_b(1,jb1),nx,1) + endif + ib2=ib1 + ib1=gxa + do jj=1,ny + yy(jj)=ybh_b(ib1,jj) + enddo + gya=ya_a(j) + if(gya < yy(1))then + gya= 1 + else if(gya > yy(ny))then + gya= ny + else + call grdcrd1(gya,yy,ny,1) + endif + jb2=jb1 + jb1=gya + if(ib1+1 > nx)then !this block( 6 lines) is copied from GSL gsi repository + ib1=ib1-1 + endif + if(jb1+1 > ny)then + jb1=jb1-1 + endif + + + if((ib1 == ib2) .and. (jb1 == jb2)) exit + if(n==3 ) then !!!!!!! if not converge, find the nearest corner point - d(1)=(xa_a(i)-xbh_b(ib1,jb1))**2+(ya_a(j)-ybh_b(ib1,jb1))**2 - d(2)=(xa_a(i)-xbh_b(ib1+1,jb1))**2+(ya_a(j)-ybh_b(ib1+1,jb1))**2 - d(3)=(xa_a(i)-xbh_b(ib1,jb1+1))**2+(ya_a(j)-ybh_b(ib1,jb1+1))**2 - d(4)=(xa_a(i)-xbh_b(ib1+1,jb1+1))**2+(ya_a(j)-ybh_b(ib1+1,jb1+1))**2 - kk=1 - do k=2,4 - if(d(k) xa_a(nxa))then - gxa= nxa - else - call grdcrd1(gxa,xa_a,nxa,1) - endif - a3ix(j,i)=int(gxa) - a3ix(j,i)=min(max(1,a3ix(j,i)),nxa) - a3dx(j,i)=max(zero,min(one,gxa-a3ix(j,i))) - a3dx1(j,i)=one-a3dx(j,i) - a3ixp(j,i)=min(nxa,a3ix(j,i)+1) - end do - end do - - do i=1,nx - do j=1,ny - gya=ybh_b(i,j) - if(gya < ya_a(1))then - gya= 1 - else if(gya > ya_a(nya))then - gya= nya - else - call grdcrd1(gya,ya_a,nya,1) - endif - a3jy(j,i)=int(gya) - a3jy(j,i)=min(max(1,a3jy(j,i)),nya) - a3dy(j,i)=max(zero,min(one,gya-a3jy(j,i))) - a3dy1(j,i)=one-a3dy(j,i) - a3jyp(j,i)=min(nya,a3jy(j,i)+1) - end do - end do + allocate ( a3dx(ny,nx),a3dx1(ny,nx),a3dy(ny,nx),a3dy1(ny,nx) ) + allocate ( a3ix(ny,nx),a3ixp(ny,nx),a3jy(ny,nx),a3jyp(ny,nx) ) + !$omp parallel do default(none),private(j,i,gxa) & + !$omp shared(ny,nx,xbh_b,xa_a,nxa,a3ix,a3dx,a3dx1,a3ixp) + do i=1,nx + do j=1,ny + gxa=xbh_b(i,j) + if(gxa < xa_a(1))then + gxa= 1 + else if(gxa > xa_a(nxa))then + gxa= nxa + else + call grdcrd1(gxa,xa_a,nxa,1) + endif + a3ix(j,i)=int(gxa) + a3ix(j,i)=min(max(1,a3ix(j,i)),nxa) + a3dx(j,i)=max(zero,min(one,gxa-a3ix(j,i))) + a3dx1(j,i)=one-a3dx(j,i) + a3ixp(j,i)=min(nxa,a3ix(j,i)+1) + end do + end do + !$omp end parallel do + + !$omp parallel do default(none),private(j,i,gya) & + !$omp shared(ny,nx,ybh_b,ya_a,nya,a3jy,a3dy,a3dy1,a3jyp) + do i=1,nx + do j=1,ny + gya=ybh_b(i,j) + if(gya < ya_a(1))then + gya= 1 + else if(gya > ya_a(nya))then + gya= nya + else + call grdcrd1(gya,ya_a,nya,1) + endif + a3jy(j,i)=int(gya) + a3jy(j,i)=min(max(1,a3jy(j,i)),nya) + a3dy(j,i)=max(zero,min(one,gya-a3jy(j,i))) + a3dy1(j,i)=one-a3dy(j,i) + a3jyp(j,i)=min(nya,a3jy(j,i)+1) + end do + end do + !$omp end parallel do !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !!! find coefficients for wind conversion btw FV3 & earth !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - allocate ( cangu(nx,ny+1),sangu(nx,ny+1),cangv(nx+1,ny),sangv(nx+1,ny) ) + allocate ( cangu(nx,ny+1),sangu(nx,ny+1),cangv(nx+1,ny),sangv(nx+1,ny) ) ! 1. compute x,y,z at cell cornor from grid_lon, grid_lat - - do j=1,ny+1 - do i=1,nx+1 - x(i,j)=cos(grid_lat(i,j)*deg2rad)*cos(grid_lon(i,j)*deg2rad) - y(i,j)=cos(grid_lat(i,j)*deg2rad)*sin(grid_lon(i,j)*deg2rad) - z(i,j)=sin(grid_lat(i,j)*deg2rad) - enddo - enddo + !$omp parallel do default(none),private(j,i) & + !$omp shared(ny,nx,grid_lat,grid_lon,deg2rad,x,y,z) + do j=1,ny+1 + do i=1,nx+1 + x(i,j)=cos(grid_lat(i,j)*deg2rad)*cos(grid_lon(i,j)*deg2rad) + y(i,j)=cos(grid_lat(i,j)*deg2rad)*sin(grid_lon(i,j)*deg2rad) + z(i,j)=sin(grid_lat(i,j)*deg2rad) + enddo + enddo + !$omp end parallel do ! 2 find angles to E-W and N-S for U edges - sq180=180._r_kind**2 - do j=1,ny+1 - do i=1,nx -! center lat/lon of the edge - rlat=half*(grid_lat(i,j)+grid_lat(i+1,j)) - diff=(grid_lon(i,j)-grid_lon(i+1,j))**2 - if(diff < sq180)then - rlon=half*(grid_lon(i,j)+grid_lon(i+1,j)) - else - rlon=half*(grid_lon(i,j)+grid_lon(i+1,j)-360._r_kind) - endif + sq180=180._r_kind**2 + !$omp parallel do default(none),private(j,i,rlat,diff,rlon,xr,yr,zr,xu,yu,zu,uval,ewval,nsval) & + !$omp shared(ny,nx,grid_lat,grid_lon,sq180,deg2rad,x,y,z,cangu,sangu) + do j=1,ny+1 + do i=1,nx +! center lat/lon of the edge + rlat=half*(grid_lat(i,j)+grid_lat(i+1,j)) + diff=(grid_lon(i,j)-grid_lon(i+1,j))**2 + if(diff < sq180)then + rlon=half*(grid_lon(i,j)+grid_lon(i+1,j)) + else + rlon=half*(grid_lon(i,j)+grid_lon(i+1,j)-360._r_kind) + endif ! vector to center of the edge - xr=cos(rlat*deg2rad)*cos(rlon*deg2rad) - yr=cos(rlat*deg2rad)*sin(rlon*deg2rad) - zr=sin(rlat*deg2rad) + xr=cos(rlat*deg2rad)*cos(rlon*deg2rad) + yr=cos(rlat*deg2rad)*sin(rlon*deg2rad) + zr=sin(rlat*deg2rad) ! vector of the edge - xu= x(i+1,j)-x(i,j) - yu= y(i+1,j)-y(i,j) - zu= z(i+1,j)-z(i,j) + xu= x(i+1,j)-x(i,j) + yu= y(i+1,j)-y(i,j) + zu= z(i+1,j)-z(i,j) ! find angle with cross product - uval=sqrt((xu**2+yu**2+zu**2)) - ewval=sqrt((xr**2+yr**2)) - nsval=sqrt((xr*zr)**2+(zr*yr)**2+(xr*xr+yr*yr)**2) - cangu(i,j)=(-yr*xu+xr*yu)/ewval/uval - sangu(i,j)=(-xr*zr*xu-zr*yr*yu+(xr*xr+yr*yr)*zu) / nsval/uval - enddo - enddo + uval=sqrt((xu**2+yu**2+zu**2)) + ewval=sqrt((xr**2+yr**2)) + nsval=sqrt((xr*zr)**2+(zr*yr)**2+(xr*xr+yr*yr)**2) + cangu(i,j)=(-yr*xu+xr*yu)/ewval/uval + sangu(i,j)=(-xr*zr*xu-zr*yr*yu+(xr*xr+yr*yr)*zu) / nsval/uval + enddo + enddo + !$omp end parallel do ! 3 find angles to E-W and N-S for V edges - do j=1,ny - do i=1,nx+1 - rlat=half*(grid_lat(i,j)+grid_lat(i,j+1)) - diff=(grid_lon(i,j)-grid_lon(i,j+1))**2 - if(diff < sq180)then - rlon=half*(grid_lon(i,j)+grid_lon(i,j+1)) - else - rlon=half*(grid_lon(i,j)+grid_lon(i,j+1)-360._r_kind) - endif - xr=cos(rlat*deg2rad)*cos(rlon*deg2rad) - yr=cos(rlat*deg2rad)*sin(rlon*deg2rad) - zr=sin(rlat*deg2rad) - xv= x(i,j+1)-x(i,j) - yv= y(i,j+1)-y(i,j) - zv= z(i,j+1)-z(i,j) - vval=sqrt((xv**2+yv**2+zv**2)) - ewval=sqrt((xr**2+yr**2)) - nsval=sqrt((xr*zr)**2+(zr*yr)**2+(xr*xr+yr*yr)**2) - cangv(i,j)=(-yr*xv+xr*yv)/ewval/vval - sangv(i,j)=(-xr*zr*xv-zr*yr*yv+(xr*xr+yr*yr)*zv) / nsval/vval - enddo - enddo - deallocate( xc,yc,zc,gclat,gclon,gcrlat,gcrlon) - deallocate(rlat_in,rlon_in) + !$omp parallel do default(none),private(j,i,rlat,diff,rlon,xr,yr,zr,xv,yv,zv,vval,ewval,nsval) & + !$omp shared(ny,nx,grid_lat,grid_lon,sq180,deg2rad,x,y,z,cangv,sangv) + do j=1,ny + do i=1,nx+1 + rlat=half*(grid_lat(i,j)+grid_lat(i,j+1)) + diff=(grid_lon(i,j)-grid_lon(i,j+1))**2 + if(diff < sq180)then + rlon=half*(grid_lon(i,j)+grid_lon(i,j+1)) + else + rlon=half*(grid_lon(i,j)+grid_lon(i,j+1)-360._r_kind) + endif + xr=cos(rlat*deg2rad)*cos(rlon*deg2rad) + yr=cos(rlat*deg2rad)*sin(rlon*deg2rad) + zr=sin(rlat*deg2rad) + xv= x(i,j+1)-x(i,j) + yv= y(i,j+1)-y(i,j) + zv= z(i,j+1)-z(i,j) + vval=sqrt((xv**2+yv**2+zv**2)) + ewval=sqrt((xr**2+yr**2)) + nsval=sqrt((xr*zr)**2+(zr*yr)**2+(xr*xr+yr*yr)**2) + cangv(i,j)=(-yr*xv+xr*yv)/ewval/vval + sangv(i,j)=(-xr*zr*xv-zr*yr*yv+(xr*xr+yr*yr)*zv) / nsval/vval + enddo + enddo + !$omp end parallel do + deallocate( xc,yc,zc,gclat,gclon,gcrlat,gcrlon) + deallocate(rlat_in,rlon_in) + ! File didnt exist so we computed the data. Now save it for subsequent runs. + if(mype==0) then + inunit=2000+mype + open(inunit,file=trim(input),form='unformatted',action='write') + write(inunit) nlat,nlon,nxa,nya, & + region_dx,region_dy,region_dxi,region_dyi, & + coeffx,coeffy,region_lat,region_lon, & + fv3dx,fv3dx1,fv3dy,fv3dy1,fv3ix,fv3ixp,fv3jy,fv3jyp, & + a3dx,a3dx1,a3dy,a3dy1,a3ix,a3ixp,a3jy,a3jyp, & + cangu,sangu,cangv,sangv + !call flush(inunit) + !res = COMMITQQ(inunit) + close(inunit) + !write(6,'("generate_anl_grid: Wrote ana_grid ",I4,A)') mype, trim(input) + endif + endif ! file exists + endif ! nodeRank==0 + + call MPI_Bcast( nxa,1,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast( nya,1,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(nlat,1,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(nlon,1,mpi_integer,0,nodeComm,ierr) + if(nodeRank/=0) then + if (allocated(region_dx )) deallocate(region_dx ) + if (allocated(region_dy )) deallocate(region_dy ) + allocate( region_dx(nlat,nlon), region_dy(nlat,nlon)) + allocate(region_dxi(nlat,nlon),region_dyi(nlat,nlon)) + allocate(coeffx(nlat,nlon),coeffy(nlat,nlon)) + if (allocated(region_lat)) deallocate(region_lat) + if (allocated(region_lon)) deallocate(region_lon) + allocate(region_lat(nlat,nlon),region_lon(nlat,nlon)) + allocate(fv3dx(nxa,nya),fv3dx1(nxa,nya),fv3dy(nxa,nya),fv3dy1(nxa,nya) ) + allocate(fv3ix(nxa,nya),fv3ixp(nxa,nya),fv3jy(nxa,nya),fv3jyp(nxa,nya) ) + allocate( a3dx( ny, nx),a3dx1(ny,nx),a3dy(ny,nx),a3dy1(ny,nx) ) + allocate( a3ix( ny, nx),a3ixp(ny,nx),a3jy(ny,nx),a3jyp(ny,nx) ) + allocate(cangu( nx,ny+1),sangu(nx,ny+1),cangv(nx+1,ny),sangv(nx+1,ny) ) + endif + + +! Broadcast arrays to the other ranks on the same node + !time_beg=MPI_Wtime() + call MPI_Bcast( region_dx,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast( region_dy,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(region_dxi,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(region_dyi,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast( coeffx,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast( coeffy,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + + call MPI_Bcast(region_lat,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(region_lon,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + !walltime=MPI_Wtime()-time_beg + + if (exists) then + ! All ranks need to call init_general_transform eventually + allocate(glat_an(nlon,nlat),glon_an(nlon,nlat)) + !$omp parallel do default(none),private(j,i) & + !$omp shared(nlon,nlat,region_lat,region_lon,glat_an,glon_an) + do j=1,nlat + do i=1,nlon + glat_an(i,j)=region_lat(j,i) + glon_an(i,j)=region_lon(j,i) + enddo + enddo + call init_general_transform(glat_an,glon_an) + deallocate(glat_an,glon_an) + else + if(nodeRank/=0) then + ! nodeRank==0 ranks already called init_general_transform + ! above so just call from remaining ranks + allocate(glat_an(nlon,nlat),glon_an(nlon,nlat)) + !$omp parallel do default(none),private(j,i) & + !$omp shared(nlon,nlat,region_lat,region_lon,glat_an,glon_an) + do j=1,nlat + do i=1,nlon + glat_an(i,j)=region_lat(j,i) + glon_an(i,j)=region_lon(j,i) + enddo + enddo + call init_general_transform(glat_an,glon_an) + deallocate(glat_an,glon_an) + endif + endif + + !time_beg=MPI_Wtime() + call MPI_Bcast(fv3dx ,nxa*nya,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(fv3dx1,nxa*nya,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(fv3dy ,nxa*nya,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(fv3dy1,nxa*nya,mpi_rtype,0,nodeComm,ierr) + + call MPI_Bcast(fv3ix ,nxa*nya,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(fv3ixp,nxa*nya,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(fv3jy ,nxa*nya,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(fv3jyp,nxa*nya,mpi_integer,0,nodeComm,ierr) + + call MPI_Bcast(a3dx ,nx*ny ,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(a3dx1 ,nx*ny ,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(a3dy ,nx*ny ,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(a3dy1 ,nx*ny ,mpi_rtype,0,nodeComm,ierr) + + call MPI_Bcast(a3ix ,nx*ny ,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(a3ixp ,nx*ny ,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(a3jy ,nx*ny ,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(a3jyp ,nx*ny ,mpi_integer,0,nodeComm,ierr) + + call MPI_Bcast(cangu ,nx*(ny+1),mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(sangu ,nx*(ny+1),mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(cangv ,(nx+1)*ny,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(sangv ,(nx+1)*ny,mpi_rtype,0,nodeComm,ierr) + !time_end=MPI_Wtime() + !call MPI_Reduce(walltime+(time_end-time_beg), walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + !if(mype==0) write(6,'("Maximum Walltime to Bcast anl_grid " f15.4)') walltime + + call MPI_Comm_free(nodeComm,ierr) end subroutine generate_anl_grid subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_latt) @@ -667,6 +881,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l allocate(gclon(nxen,nyen)) allocate(gcrlat(nxen,nyen)) allocate(gcrlon(nxen,nyen)) + !$omp parallel do default(none),private(j,i) & + !$omp shared(nyen,nxen,xc,grid_latt,grid_lont,deg2rad,yc,zc) do j=1,nyen do i=1,nxen xc(i,j)=cos(grid_latt(i,j)*deg2rad)*cos(grid_lont(i,j)*deg2rad) @@ -724,6 +940,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l !!!!!! fv3 rotated grid; not equal spacing, non_orthogonal !!!!!! + !$omp parallel do default(none),private(j,jr,i,ir) & + !$omp shared(nyen,nxen,xbh_b,gcrlon,dlon) do j=1,nyen jr=nyen+1-j do i=1,nxen @@ -731,6 +949,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l xbh_b(ir,jr)=gcrlon(i,j)/dlon end do end do + !$omp parallel do default(none),private(j,jr,i,ir) & + !$omp shared(nyen,nxen,ybh_b,gcrlat,dlat) do j=1,nyen jr=nyen+1-j do i=1,nxen @@ -757,6 +977,9 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l ! iteration to find the fv3 grid cell jb1=1 ib1=1 + !$omp parallel do default(none),private(j,i,n,gxa,ib2,jj,yy,gya,jb2,d,kk,k,ds) & + !$omp shared(nye,nxe,xa_a,xbh_b,nxen,ybh_b,ya_a,nyen,fv3ixens,fv3ixpens,fv3jyens,fv3jypens,fv3dyens,fv3dy1ens,fv3dxens,fv3dx1ens,bilinear) & + !$omp firstprivate(jb1,ib1) do j=1,nye do i=1,nxe do n=1,3 @@ -892,6 +1115,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l allocate (a3dxens(nyen,nxen),a3dx1ens(nyen,nxen),a3dyens(nyen,nxen),a3dy1ens(nyen,nxen)) allocate (a3ixens(nyen,nxen),a3ixpens(nyen,nxen),a3jyens(nyen,nxen),a3jypens(nyen,nxen)) + !$omp parallel do default(none),private(j,i,gxa) & + !$omp shared(nyen,nxen,xbh_b,xa_a,nxe,a3ixens,a3dxens,a3dx1ens,a3ixpens) do i=1,nxen do j=1,nyen gxa=xbh_b(i,j) @@ -910,6 +1135,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l end do end do + !$omp parallel do default(none),private(j,i,gya) & + !$omp shared(nyen,nxen,ybh_b,ya_a,nye,a3jyens,a3dyens,a3dy1ens,a3jypens) do i=1,nxen do j=1,nyen gya=ybh_b(i,j) @@ -935,7 +1162,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l allocate (canguens(nxen,nyen+1),sanguens(nxen,nyen+1),cangvens(nxen+1,nyen),sangvens(nxen+1,nyen)) ! 1. compute x,y,z at cell cornor from grid_lon, grid_lat - + !$omp parallel do default(none),private(j,i) & + !$omp shared(nyen,nxen,grid_lat,grid_lon,deg2rad,x,y,z) do j=1,nyen+1 do i=1,nxen+1 x(i,j)=cos(grid_lat(i,j)*deg2rad)*cos(grid_lon(i,j)*deg2rad) @@ -947,6 +1175,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l ! 2 find angles to E-W and N-S for U edges sq180=180._r_kind**2 + !$omp parallel do default(none),private(j,i,rlat,diff,rlon,xr,yr,zr,xu,yu,zu,uval,ewval,nsval) & + !$omp shared(nyen,nxen,grid_lat,grid_lon,sq180,deg2rad,x,y,z,canguens,sanguens) do j=1,nyen+1 do i=1,nxen ! center lat/lon of the edge @@ -975,6 +1205,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l enddo ! 3 find angles to E-W and N-S for V edges + !$omp parallel do default(none),private(j,i,rlat,diff,rlon,xr,yr,zr,xv,yv,zv,vval,ewval,nsval) & + !$omp shared(nyen,nxen,grid_lat,grid_lon,sq180,deg2rad,x,y,z,cangvens,sangvens) do j=1,nyen do i=1,nxen+1 rlat=half*(grid_lat(i,j)+grid_lat(i,j+1)) @@ -1093,6 +1325,8 @@ subroutine fv3uv2earth(u,v,nx,ny,u_out,v_out) real(r_kind),intent( out) :: u_out(nx,ny),v_out(nx,ny) integer(i_kind) i,j + !$omp parallel do default(none), private(j,i), & + !$omp shared(ny,nx,u,sangv,v,sangu,cangu,cangv,u_out,v_out) do j=1,ny do i=1,nx u_out(i,j)=half *( (u(i,j)*sangv(i,j)-v(i,j)*sangu(i,j))/(cangu(i,j)*sangv(i,j)-sangu(i,j)*cangv(i,j)) & @@ -1194,6 +1428,8 @@ subroutine fv3_h_to_ll(b_in,a,nb,mb,na,ma,rev_flg) nbp=nb+1 if(rev_flg) then !!!!!!!!! reverse E-W and N-S + !$omp parallel do default(none), private(j,jr,i,ir), & + !$omp shared(mb,mbp,nb,nbp,b_in,b) do j=1,mb jr=mbp-j do i=1,nb @@ -1206,13 +1442,17 @@ subroutine fv3_h_to_ll(b_in,a,nb,mb,na,ma,rev_flg) endif !!!!!!!!! interpolate to A grid & reverse ij for array a(lat,lon) if(bilinear)then ! bilinear interpolation + !$omp parallel do default(none), private(j,i), & + !$omp shared(ma,na,b,a,fv3dx1,fv3dy1,fv3ix,fv3jy,fv3dy,fv3jyp,fv3dx,fv3ixp) do j=1,ma do i=1,na a(j,i)=fv3dx1(i,j)*(fv3dy1(i,j)*b(fv3ix (i,j),fv3jy(i,j))+fv3dy(i,j)*b(fv3ix (i,j),fv3jyp(i,j))) & +fv3dx (i,j)*(fv3dy1(i,j)*b(fv3ixp(i,j),fv3jy(i,j))+fv3dy(i,j)*b(fv3ixp(i,j),fv3jyp(i,j))) end do end do - else ! inverse-distance weighting average + else ! inverse-distance weighting average + !$omp parallel do default(none), private(j,i), & + !$omp shared(ma,na,b,a,fv3dx,fv3ix,fv3jy,fv3dy,fv3jyp,fv3dx1,fv3ixp,fv3dy1) do j=1,ma do i=1,na a(j,i)=fv3dx(i,j)*b(fv3ix (i,j),fv3jy(i,j))+fv3dy(i,j)*b(fv3ix (i,j),fv3jyp(i,j)) & @@ -1351,7 +1591,7 @@ subroutine fv3_ll_to_h(a,b,nxa,nya,nxb,nyb,rev_flg) do j=1,nxb jr=nxbp-j b(jr+ijr)=a3dy1(i,j)*(a3dx1(i,j)*a(a3jy (i,j),a3ix(i,j))+a3dx(i,j)*a(a3jy (i,j),a3ixp(i,j))) & - +a3dy (i,j)*(a3dx1(i,j)*a(a3jyp(i,j),a3ix(i,j))+a3dx(i,j)*a(a3jyp(i,j),a3ixp(i,j))) + +a3dy (i,j)*(a3dx1(i,j)*a(a3jyp(i,j),a3ix(i,j))+a3dx(i,j)*a(a3jyp(i,j),a3ixp(i,j))) end do end do else @@ -1360,7 +1600,7 @@ subroutine fv3_ll_to_h(a,b,nxa,nya,nxb,nyb,rev_flg) ijr=(i-1)*nxb do j=1,nxb b(j+ijr)=a3dy1(i,j)*(a3dx1(i,j)*a(a3jy (i,j),a3ix(i,j))+a3dx(i,j)*a(a3jy (i,j),a3ixp(i,j))) & - +a3dy (i,j)*(a3dx1(i,j)*a(a3jyp(i,j),a3ix(i,j))+a3dx(i,j)*a(a3jyp(i,j),a3ixp(i,j))) + +a3dy (i,j)*(a3dx1(i,j)*a(a3jyp(i,j),a3ix(i,j))+a3dx(i,j)*a(a3jyp(i,j),a3ixp(i,j))) end do end do endif @@ -1418,7 +1658,8 @@ subroutine rotate2deg(rlon_in,rlat_in,rlon_out,rlat_out,rlon0,rlat0,nx,ny) real(r_kind) x,y,z, xt,yt,zt, xtt,ytt,ztt integer(i_kind) i,j - + !$omp parallel do default(none),private(j,i,x,y,z,xt,yt,zt,xtt,ytt,ztt) & + !$omp shared(ny,nx,rlat_in,rlon_in,deg2rad,rlon0,rlat0,rlat_out,rlon_out,rad2deg) do j=1,ny do i=1,nx ! 1. compute x,y,z from rlon_in, rlat_in @@ -1475,6 +1716,8 @@ subroutine unrotate2deg(rlon_in,rlat_in,rlon_out,rlat_out,rlon0,rlat0,nx,ny) real(r_kind) x,y,z, xt,yt,zt, xtt,ytt,ztt integer(i_kind) i,j + !$omp parallel do default(none),private(j,i,x,y,z,xt,yt,zt,xtt,ytt,ztt) & + !$omp shared(ny,nx,rlat_in,rlon_in,deg2rad,rlat0,rlon0,rlat_out,rlon_out,rad2deg) do j=1,ny do i=1,nx xtt=cos(rlat_out(i,j)*deg2rad)*cos(rlon_out(i,j)*deg2rad) diff --git a/src/gsi/pcgsoi.f90 b/src/gsi/pcgsoi.f90 index 0b808c5c5..266c43b77 100644 --- a/src/gsi/pcgsoi.f90 +++ b/src/gsi/pcgsoi.f90 @@ -159,6 +159,7 @@ subroutine pcgsoi() use berror, only: vprecond use stpjomod, only: stpjo_setup use intradmod, only: setrad + use mpi, only : MPI_Wtime, MPI_Comm_World, MPI_REAL8, MPI_MAX, MPI_SUCCESS implicit none @@ -191,6 +192,8 @@ subroutine pcgsoi() integer(i_kind) :: iortho logical :: print_verbose,ortho,diag_print logical :: lanlerr,read_success + real(kind=8) :: time_beg,time_end,walltime + integer(i_kind) :: ierr ! Step size diagnostic strings data step /'good', 'SMALL'/ @@ -638,7 +641,14 @@ subroutine pcgsoi() ! Write output analysis files if(.not.l4dvar) call prt_guess('analysis') call prt_state_norms(sval(1),'increment') - if (twodvar_regional .or. jiter == miter) call write_all(-1) + if (twodvar_regional .or. jiter == miter) then + time_beg=MPI_Wtime() + call write_all(-1) + time_end=MPI_Wtime() + call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("Maximum Walltime for write_all" f15.4)') walltime + endif ! Overwrite guess with increment (4d-var only, for now) if (iwrtinc>0) then From 1ae752ddfcfd4914e81af8d5594c276672a1621f Mon Sep 17 00:00:00 2001 From: Shun Liu Date: Mon, 2 Dec 2024 21:10:34 +0000 Subject: [PATCH 03/12] add GDIT's performance optimization for RRFS --- src/gsi/crc32.F90 | 23 +++++++++++++++++++++++ src/gsi/crc32_c.c | 7 +++++++ 2 files changed, 30 insertions(+) create mode 100644 src/gsi/crc32.F90 create mode 100644 src/gsi/crc32_c.c diff --git a/src/gsi/crc32.F90 b/src/gsi/crc32.F90 new file mode 100644 index 000000000..dd22e4c80 --- /dev/null +++ b/src/gsi/crc32.F90 @@ -0,0 +1,23 @@ +module crc32 + use iso_c_binding + implicit none + public :: digest + + interface + integer function digest_c(message) bind(c) + use iso_c_binding, only: c_char + character(kind=c_char), intent(in) :: message(*) + end function digest_c + end interface + + contains + + integer function digest(m) + use iso_c_binding, only: c_null_char + implicit none + character(len=*), intent(in) :: m + !m='nid001019' + digest=abs(digest_c(trim(m)//c_null_char)) + !write(6,'("Digest ",I12)') digest + end function digest +end module crc32 diff --git a/src/gsi/crc32_c.c b/src/gsi/crc32_c.c new file mode 100644 index 000000000..049a725ba --- /dev/null +++ b/src/gsi/crc32_c.c @@ -0,0 +1,7 @@ +#include +#include +#include + +uLong digest_c(char * message) { + return crc32(0, (const void*)message, strlen(message)); +} From 0b21bad1970cb6ea88f4fac33b489e68f5bde0c1 Mon Sep 17 00:00:00 2001 From: Shun Liu Date: Mon, 2 Dec 2024 21:14:15 +0000 Subject: [PATCH 04/12] initialize variables in observer_fv3reg.f90 to avoid the warning message when compiling --- src/enkf/observer_fv3reg.f90 | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/enkf/observer_fv3reg.f90 b/src/enkf/observer_fv3reg.f90 index 8f77bca41..9084ee58f 100644 --- a/src/enkf/observer_fv3reg.f90 +++ b/src/enkf/observer_fv3reg.f90 @@ -45,7 +45,7 @@ subroutine setup_linhx(rlat, rlon, time, ix, delx, ixp, delxp, iy, dely, & use params, only: nstatefields, nlons, nlats, nlevs, nhr_state, fhr_assim use gridinfo, only: npts, latsgrd, lonsgrd use statevec, only: nsdim - use constants, only: zero,one,pi + use constants, only: izero,zero,one,pi use mpisetup implicit none @@ -54,6 +54,18 @@ subroutine setup_linhx(rlat, rlon, time, ix, delx, ixp, delxp, iy, dely, & real(r_single) ,intent(in ) :: time ! observation time relative to middle of window integer(i_kind), intent(out) :: ix, iy, it, ixp, iyp, itp real(r_kind), intent(out) :: delx, dely, delxp, delyp, delt, deltp + ix=izero + iy=izero + it=izero + ixp=izero + iyp=izero + itp=izero + delx=zero + dely=zero + delxp=zero + delyp=zero + delt=zero + deltp=zero write(6,*)'this is a dummy subroutine, running this means something wrong ,stop' call stop2(555) @@ -110,6 +122,7 @@ subroutine calc_linhx(hx, dens, dhx_dx, hxpert, hx_ens, & type(raggedarr) ,intent(inout) :: hxpert ! interpolated background real(r_single) ,intent( out) :: hx_ens ! H (x_ens) integer(i_kind) i,j + hx_ens=zero write(6,*)'this is a dummy subroutine, running this means something wrong ,stop' call stop2(555) @@ -145,6 +158,7 @@ subroutine calc_linhx_modens(hx, dhx_dx, hxpert, hx_ens, vscale) !$$$ use kinds, only: r_kind,i_kind,r_single use sparsearr, only: sparr, raggedarr + use constants, only: zero use mpisetup implicit none @@ -155,6 +169,7 @@ subroutine calc_linhx_modens(hx, dhx_dx, hxpert, hx_ens, vscale) real(r_single) ,intent( out) :: hx_ens(neigv)! H (x_ens) real(r_double),dimension(neigv,nlevs+1) ,intent(in ) :: vscale ! vertical scaling (for modulated ens) integer(i_kind) i + hx_ens=zero write(6,*)'this is a dummy subroutine, running this means something wrong ,stop' call stop2(555) From ffabb40ef752b08a4e7cc9179e6a7517c4279511 Mon Sep 17 00:00:00 2001 From: Shun Liu Date: Thu, 6 Mar 2025 21:17:57 +0000 Subject: [PATCH 05/12] update github ci yamls --- .github/workflows/gcc.yml | 19 +++++++++---------- .github/workflows/intel.yml | 16 ++++++++++------ 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index 1f6fa3afc..eef57cecb 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -9,9 +9,9 @@ defaults: env: cache_key: gcc - CC: gcc-10 - FC: gfortran-10 - CXX: g++-10 + CC: gcc-13 + FC: gfortran-13 + CXX: g++-13 # The jobs are split into: # 1. a dependency build step (setup), and @@ -34,7 +34,7 @@ jobs: # Cache spack, compiler and dependencies - name: cache-env id: cache-env - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | spack @@ -43,19 +43,18 @@ jobs: # Install dependencies using Spack - name: install-dependencies-with-spack - if: steps.cache-env.outputs.cache-hit != 'true' + # if: steps.cache-env.outputs.cache-hit != 'true' run: | sudo apt-get install cmake + rm -rf spack git clone -c feature.manyFiles=true https://github.com/JCSDA/spack.git source spack/share/spack/setup-env.sh - spack env create gsi-env gsi/ci/spack.yaml + spack env create gsi-env gsi/ci/spack_gcc.yaml spack env activate gsi-env spack compiler find - sudo apt install cmake spack external find - spack add mpich@3.4.2 spack concretize - spack install -v --fail-fast --dirty + spack install --fail-fast --dirty spack clean -a gsi: @@ -70,7 +69,7 @@ jobs: - name: cache-env id: cache-env - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | spack diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index d21420687..9d2dc4c29 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -34,7 +34,9 @@ jobs: sudo swapoff -a sudo rm -f /swapfile sudo apt clean - docker rmi $(docker image ls -aq) + DOCKER_IMGS=$(docker image ls -aq) + if [[ ! -z "${DOCKER_IMGS}" ]]; then docker rmi ${DOCKER_IMGS}; fi + df -h # Checkout the GSI to get the ci/spack.yaml file - name: checkout @@ -52,9 +54,10 @@ jobs: spack ~/.spack /opt/intel - key: spack-${{ runner.os }}-${{ env.cache_key }}-${{ hashFiles('gsi/ci/spack.yaml') }} + key: spack-${{ runner.os }}-${{ env.cache_key }}-${{ hashFiles('gsi/ci/spack.yaml') }}-1 - name: install-intel-compilers + if: steps.cache-env.outputs.cache-hit != 'true' run: | wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB @@ -70,16 +73,15 @@ jobs: run: | sudo mv /usr/local/ /usr_local_mv sudo apt-get install cmake + rm -rf spack git clone -c feature.manyFiles=true https://github.com/JCSDA/spack.git source spack/share/spack/setup-env.sh - spack env create gsi-env gsi/ci/spack.yaml + spack env create gsi-env gsi/ci/spack_intel.yaml spack env activate gsi-env spack compiler find - sudo apt install cmake spack external find - spack add intel-oneapi-mpi spack concretize - spack install -v --fail-fast --dirty + spack install --fail-fast --dirty spack clean -a gsi: @@ -108,6 +110,8 @@ jobs: - name: build run: | + sudo mv /usr/local/ /usr_local_mv + sudo apt-get install cmake libblas-dev liblapack-dev source spack/share/spack/setup-env.sh spack env activate gsi-env cd gsi From c814d948fd8128916ad562e1576211f331185541 Mon Sep 17 00:00:00 2001 From: Shun Liu Date: Fri, 7 Mar 2025 15:34:35 +0000 Subject: [PATCH 06/12] add gdit's modification --- src/gsi/hybrid_ensemble_isotropic.F90 | 228 ++++++++++++++++++++------ 1 file changed, 177 insertions(+), 51 deletions(-) diff --git a/src/gsi/hybrid_ensemble_isotropic.F90 b/src/gsi/hybrid_ensemble_isotropic.F90 index 87f3605ea..bcf3a973b 100644 --- a/src/gsi/hybrid_ensemble_isotropic.F90 +++ b/src/gsi/hybrid_ensemble_isotropic.F90 @@ -732,6 +732,8 @@ subroutine new_factorization_rf_y(f,iadvance,iback,nlevs,ig) nx=grd_loc%nlon ; ny=grd_loc%nlat ; nz=nlevs if(vvlocal)then +!$omp parallel do default(none), schedule(static,1) private(k,j,i,l) & +!$omp shared(nx,ny,nz,ig,iadvance,ynorm_new,f,fmaty,fmat0y,iback) do k=1,nz do j=1,nx @@ -764,6 +766,8 @@ subroutine new_factorization_rf_y(f,iadvance,iback,nlevs,ig) enddo enddo else +!$omp parallel do default(none),schedule(static,1) private(k,j,i,l) & +!$omp shared(nx,ny,nz,ig,iadvance,ynorm_new,f,fmaty,fmat0y,iback) do k=1,nz do j=1,nx @@ -908,12 +912,22 @@ subroutine normal_new_factorization_rf_x use hybrid_ensemble_parameters, only: grd_loc,vvlocal use hybrid_ensemble_parameters, only: naensgrp,naensloc use constants, only: zero,one + use mpimod, only: mpi_rtype + use mpi + use crc32 implicit none - integer(i_kind) i,j,k,iadvance,iback,kl,ig + integer(i_kind) i,j,k,iadvance,iback,kl,ig,npes,inunit + integer(i_kind) name_len,message_len,nodeID,nodeComm,nodeRank,RanksPerNode,ierr real(r_kind) f(grd_loc%nlat,grd_loc%nlon,grd_loc%kend_alloc+1-grd_loc%kbegin_loc) real(r_kind),allocatable:: diag(:,:,:) + real(i_kind),allocatable:: sz(:) + character(len=72) :: input + character(len=5) :: np,nlat,nlon + logical :: exists + character(len=MPI_MAX_PROCESSOR_NAME) :: nodeName + character(len=MPI_MAX_ERROR_STRING) :: message ! real(r_kind) diag(grd_loc%nlat,grd_loc%nlon) ! possible to have kend_loc - kbegin_loc-1 for processors not involved @@ -922,63 +936,175 @@ subroutine normal_new_factorization_rf_x if(grd_loc%kend_loc < grd_loc%kbegin_loc) return if(vvlocal)then - kl=grd_loc%kend_alloc+1-grd_loc%kbegin_loc + kl=grd_loc%kend_alloc+1-grd_loc%kbegin_loc else - kl=1 + kl=1 endif - if(allocated(xnorm_new)) deallocate(xnorm_new) - allocate(xnorm_new(grd_loc%nlat,grd_loc%nlon,kl,naensloc)) - if(allocated(diag)) deallocate(diag) - allocate(diag(grd_loc%nlat,grd_loc%nlon,kl)) - xnorm_new=one - do ig=1,naensgrp - do j=1,grd_loc%nlon - f=zero - do k=1,kl - do i=1,grd_loc%nlat - f(i,j,k)=one - enddo - enddo - iadvance=1 ; iback=2 - call new_factorization_rf_x(f,iadvance,iback,kl,ig) - iadvance=2 ; iback=1 - call new_factorization_rf_x(f,iadvance,iback,kl,ig) - do k=1,kl - do i=1,grd_loc%nlat - diag(i,j,k)=sqrt(one/f(i,j,k)) - enddo - enddo - enddo - do k=1,kl - do j=1,grd_loc%nlon - do i=1,grd_loc%nlat - xnorm_new(i,j,k,ig)=diag(i,j,k) - enddo - enddo - enddo - enddo !ig loop +! The Optimization depends on each rank on a node having the same sized xnorm_new array. +! This is generally not true when vvlocal=.true. (readin_localization in gsiparm.anl) because kl varies in that case + call MPI_Comm_size(MPI_COMM_WORLD,npes,ierr) + allocate(sz(npes)) + call MPI_Allgather((grd_loc%nlat*grd_loc%nlon*kl*naensloc),1,MPI_Integer,sz,1,MPI_Integer,MPI_COMM_WORLD,ierr) + + if(vvlocal .or. any(sz /= sz(1))) then + ! Must generate xnorm_new on all ranks + if (mype==0) write(6,'("new_factorization_rf_x: Default")') + if(allocated(xnorm_new)) deallocate(xnorm_new) + allocate(xnorm_new(grd_loc%nlat,grd_loc%nlon,kl,naensloc)) + !write(6,'("new_factorization_rf_x: Default ",5I6)'),mype,grd_loc%nlat,grd_loc%nlon,kl,naensloc + if(allocated(diag)) deallocate(diag) + allocate(diag(grd_loc%nlat,grd_loc%nlon,kl)) + xnorm_new=one + + do ig=1,naensgrp + do j=1,grd_loc%nlon + f=zero + do k=1,kl + do i=1,grd_loc%nlat + f(i,j,k)=one + enddo + enddo + iadvance=1 ; iback=2 + call new_factorization_rf_x(f,iadvance,iback,kl,ig) + iadvance=2 ; iback=1 + call new_factorization_rf_x(f,iadvance,iback,kl,ig) + do k=1,kl + do i=1,grd_loc%nlat + diag(i,j,k)=sqrt(one/f(i,j,k)) + enddo + enddo + enddo + do k=1,kl + do j=1,grd_loc%nlon + do i=1,grd_loc%nlat + xnorm_new(i,j,k,ig)=diag(i,j,k) + enddo + enddo + enddo + enddo !ig loop ! check accuracy of xnorm - if(debug) then - do j=1,grd_loc%nlon - f=zero - do k=1,kl - do i=1,grd_loc%nlat - f(i,j,k)=one + if(debug) then + do j=1,grd_loc%nlon + f=zero + do k=1,kl + do i=1,grd_loc%nlat + f(i,j,k)=one + enddo + enddo + iadvance=1 ; iback=2 + call new_factorization_rf_x(f,iadvance,iback,kl,1) + iadvance=2 ; iback=1 + call new_factorization_rf_x(f,iadvance,iback,kl,1) + do k=1,kl + do i=1,grd_loc%nlat + diag(i,j,k)=f(i,j,k) + enddo + enddo + enddo + write(6,*)' in normal_new_factorization_rf_x,min,max(diag)=',minval(diag),maxval(diag) + endif + elseif (any(sz == sz(1))) then ! Use optimization + if(mype==0) write(6,'("new_factorization_rf_x: Opt")') + !write(6,'("new_factorization_rf_x: Opt ",5I6)'),mype,grd_loc%nlat,grd_loc%nlon,kl,naensloc + ! Fill xnorm_new using one rank per compute node to avoid + ! contension for memory bandwidth + call MPI_Get_processor_name(nodeName,name_len,ierr) + nodeID=digest(trim(nodeName)) + !read(nodeName(4:9),*) nodeID ! Danger! This approach will not to work on platforms other than WCOSS2 + call MPI_Comm_split(mpi_comm_world, nodeID, mype, nodeComm, ierr) + call MPI_Comm_rank(nodeComm,nodeRank,ierr) + call MPI_Comm_size(nodeComm,RanksPerNode,ierr) + + if(allocated(xnorm_new)) deallocate(xnorm_new) + allocate(xnorm_new(grd_loc%nlat,grd_loc%nlon,kl,naensloc)) + + if(nodeRank==0) then + ! Check for existing file + write(nlat, '(i0)') grd_loc%nlat; nlat = adjustl(nlat) + write(nlon, '(i0)') grd_loc%nlon; nlon = adjustl(nlon) + write(np, '(i0)') npes; np = adjustl(np) + input= 'xnorm_new.'//trim(np)//'.'//trim(nlat)//'.'//trim(nlon) + inquire (file=trim(input), EXIST=exists) + if (exists) then + inunit=2000+mype + open(inunit,file=trim(input),form='unformatted',action='read') + read(inunit) xnorm_new + close(inunit) + else ! Generate the data and have rank 0 write it out for later use + if(allocated(diag)) deallocate(diag) + allocate(diag(grd_loc%nlat,grd_loc%nlon,kl)) + xnorm_new=one + + do ig=1,naensgrp + do j=1,grd_loc%nlon + f=zero + do k=1,kl + do i=1,grd_loc%nlat + f(i,j,k)=one + enddo + enddo + iadvance=1 ; iback=2 + call new_factorization_rf_x(f,iadvance,iback,kl,ig) + iadvance=2 ; iback=1 + call new_factorization_rf_x(f,iadvance,iback,kl,ig) + do k=1,kl + do i=1,grd_loc%nlat + diag(i,j,k)=sqrt(one/f(i,j,k)) + enddo + enddo enddo - enddo - iadvance=1 ; iback=2 - call new_factorization_rf_x(f,iadvance,iback,kl,1) - iadvance=2 ; iback=1 - call new_factorization_rf_x(f,iadvance,iback,kl,1) - do k=1,kl - do i=1,grd_loc%nlat - diag(i,j,k)=f(i,j,k) + do k=1,kl + do j=1,grd_loc%nlon + do i=1,grd_loc%nlat + xnorm_new(i,j,k,ig)=diag(i,j,k) + enddo + enddo enddo - enddo - enddo - write(6,*)' in normal_new_factorization_rf_x,min,max(diag)=',minval(diag),maxval(diag) + enddo !ig loop +! check accuracy of xnorm + if(debug) then + do j=1,grd_loc%nlon + f=zero + do k=1,kl + do i=1,grd_loc%nlat + f(i,j,k)=one + enddo + enddo + iadvance=1 ; iback=2 + call new_factorization_rf_x(f,iadvance,iback,kl,1) + iadvance=2 ; iback=1 + call new_factorization_rf_x(f,iadvance,iback,kl,1) + do k=1,kl + do i=1,grd_loc%nlat + diag(i,j,k)=f(i,j,k) + enddo + enddo + enddo + write(6,*)' in normal_new_factorization_rf_x,min,max(diag)=',minval(diag),maxval(diag) + endif + ! File didnt exist so we computed the data. Now save it for subsequent runs. + if(mype==0) then + open(inunit,file=trim(input),form='unformatted',action='write') + write(inunit) xnorm_new + close(inunit) + endif + endif ! file exists + endif ! nodeRank==0 + +! Broadcast xnorm_new to the other ranks on the same node + call MPI_Bcast(xnorm_new,size(xnorm_new),mpi_rtype,0,nodeComm,ierr) + if(ierr /= MPI_SUCCESS) then + call MPI_Error_string(ierr, message, message_len, ierr) + write(6,'("new_factorization_rf_x: Error from MPI_Bcast ",I4,A)') mype, trim(message) + call MPI_Abort(MPI_COMM_WORLD, 10, ierr) + endif + call MPI_Comm_free(nodeComm,ierr) + else + write(6,'("new_factorization_rf_x: Scenario not yet supported")') + call MPI_Abort(MPI_COMM_WORLD, 13, ierr) endif + return end subroutine normal_new_factorization_rf_x From 5a9f28b818f6d3bdb1a1e06b48345c3041648a5e Mon Sep 17 00:00:00 2001 From: Shun Liu Date: Mon, 17 Mar 2025 19:38:25 +0000 Subject: [PATCH 07/12] use indent format with three spaces --- src/gsi/hybrid_ensemble_isotropic.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gsi/hybrid_ensemble_isotropic.F90 b/src/gsi/hybrid_ensemble_isotropic.F90 index bcf3a973b..d1c8a7211 100644 --- a/src/gsi/hybrid_ensemble_isotropic.F90 +++ b/src/gsi/hybrid_ensemble_isotropic.F90 @@ -936,9 +936,9 @@ subroutine normal_new_factorization_rf_x if(grd_loc%kend_loc < grd_loc%kbegin_loc) return if(vvlocal)then - kl=grd_loc%kend_alloc+1-grd_loc%kbegin_loc + kl=grd_loc%kend_alloc+1-grd_loc%kbegin_loc else - kl=1 + kl=1 endif ! The Optimization depends on each rank on a node having the same sized xnorm_new array. From 0893efc0c3e5a86016d9051243681d70ef7a1cc6 Mon Sep 17 00:00:00 2001 From: ShunLiu-NOAA Date: Wed, 19 Mar 2025 09:50:19 -0700 Subject: [PATCH 08/12] rrfs.v1.0.0 release branch update with GDIT's performance optimization (#841) add GDIT's performance optimization for RRFS to release branch initialize variables in observer_fv3reg.f90 to avoid the warning message when compiling --- .github/workflows/gcc.yml | 19 +- .github/workflows/intel.yml | 16 +- modulefiles/gsi_wcoss2.intel.lua | 6 +- src/enkf/observer_fv3reg.f90 | 17 +- src/gsi/CMakeLists.txt | 2 + src/gsi/cplr_get_fv3_regional_ensperts.f90 | 47 +- src/gsi/crc32.F90 | 23 + src/gsi/crc32_c.c | 7 + src/gsi/get_gefs_for_regional.f90 | 13 + src/gsi/gsi_files.cmake | 2 + src/gsi/hybrid_ensemble_isotropic.F90 | 224 +++-- src/gsi/mod_fv3_lola.f90 | 975 +++++++++++++-------- src/gsi/pcgsoi.f90 | 12 +- 13 files changed, 925 insertions(+), 438 deletions(-) create mode 100644 src/gsi/crc32.F90 create mode 100644 src/gsi/crc32_c.c diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index 1f6fa3afc..eef57cecb 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -9,9 +9,9 @@ defaults: env: cache_key: gcc - CC: gcc-10 - FC: gfortran-10 - CXX: g++-10 + CC: gcc-13 + FC: gfortran-13 + CXX: g++-13 # The jobs are split into: # 1. a dependency build step (setup), and @@ -34,7 +34,7 @@ jobs: # Cache spack, compiler and dependencies - name: cache-env id: cache-env - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | spack @@ -43,19 +43,18 @@ jobs: # Install dependencies using Spack - name: install-dependencies-with-spack - if: steps.cache-env.outputs.cache-hit != 'true' + # if: steps.cache-env.outputs.cache-hit != 'true' run: | sudo apt-get install cmake + rm -rf spack git clone -c feature.manyFiles=true https://github.com/JCSDA/spack.git source spack/share/spack/setup-env.sh - spack env create gsi-env gsi/ci/spack.yaml + spack env create gsi-env gsi/ci/spack_gcc.yaml spack env activate gsi-env spack compiler find - sudo apt install cmake spack external find - spack add mpich@3.4.2 spack concretize - spack install -v --fail-fast --dirty + spack install --fail-fast --dirty spack clean -a gsi: @@ -70,7 +69,7 @@ jobs: - name: cache-env id: cache-env - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | spack diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index d21420687..9d2dc4c29 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -34,7 +34,9 @@ jobs: sudo swapoff -a sudo rm -f /swapfile sudo apt clean - docker rmi $(docker image ls -aq) + DOCKER_IMGS=$(docker image ls -aq) + if [[ ! -z "${DOCKER_IMGS}" ]]; then docker rmi ${DOCKER_IMGS}; fi + df -h # Checkout the GSI to get the ci/spack.yaml file - name: checkout @@ -52,9 +54,10 @@ jobs: spack ~/.spack /opt/intel - key: spack-${{ runner.os }}-${{ env.cache_key }}-${{ hashFiles('gsi/ci/spack.yaml') }} + key: spack-${{ runner.os }}-${{ env.cache_key }}-${{ hashFiles('gsi/ci/spack.yaml') }}-1 - name: install-intel-compilers + if: steps.cache-env.outputs.cache-hit != 'true' run: | wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB @@ -70,16 +73,15 @@ jobs: run: | sudo mv /usr/local/ /usr_local_mv sudo apt-get install cmake + rm -rf spack git clone -c feature.manyFiles=true https://github.com/JCSDA/spack.git source spack/share/spack/setup-env.sh - spack env create gsi-env gsi/ci/spack.yaml + spack env create gsi-env gsi/ci/spack_intel.yaml spack env activate gsi-env spack compiler find - sudo apt install cmake spack external find - spack add intel-oneapi-mpi spack concretize - spack install -v --fail-fast --dirty + spack install --fail-fast --dirty spack clean -a gsi: @@ -108,6 +110,8 @@ jobs: - name: build run: | + sudo mv /usr/local/ /usr_local_mv + sudo apt-get install cmake libblas-dev liblapack-dev source spack/share/spack/setup-env.sh spack env activate gsi-env cd gsi diff --git a/modulefiles/gsi_wcoss2.intel.lua b/modulefiles/gsi_wcoss2.intel.lua index c3bfd1156..116baedb9 100644 --- a/modulefiles/gsi_wcoss2.intel.lua +++ b/modulefiles/gsi_wcoss2.intel.lua @@ -9,7 +9,9 @@ local cmake_ver= os.getenv("cmake_ver") or "3.20.2" local python_ver=os.getenv("python_ver") or "3.8.6" local prod_util_ver=os.getenv("prod_util_ver") or "2.0.10" -local netcdf_ver=os.getenv("netcdf_ver") or "4.7.4" +local zlib_ver=os.getenv("zlib_ver") or "1.2.11" +local hdf5_ver=os.getenv("hdf5_ver") or "1.14.0" +local netcdf_ver=os.getenv("netcdf_ver") or "4.9.2" local bufr_ver=os.getenv("bufr_ver") or "11.7.0" local bacio_ver=os.getenv("bacio_ver") or "2.4.1" local w3emc_ver=os.getenv("w3emc_ver") or "2.9.2" @@ -32,6 +34,8 @@ load(pathJoin("python", python_ver)) load(pathJoin("prod_util", prod_util_ver)) +load(pathJoin("zlib", zlib_ver)) +load(pathJoin("hdf5", hdf5_ver)) load(pathJoin("netcdf", netcdf_ver)) load(pathJoin("bufr", bufr_ver)) load(pathJoin("bacio", bacio_ver)) diff --git a/src/enkf/observer_fv3reg.f90 b/src/enkf/observer_fv3reg.f90 index 8f77bca41..9084ee58f 100644 --- a/src/enkf/observer_fv3reg.f90 +++ b/src/enkf/observer_fv3reg.f90 @@ -45,7 +45,7 @@ subroutine setup_linhx(rlat, rlon, time, ix, delx, ixp, delxp, iy, dely, & use params, only: nstatefields, nlons, nlats, nlevs, nhr_state, fhr_assim use gridinfo, only: npts, latsgrd, lonsgrd use statevec, only: nsdim - use constants, only: zero,one,pi + use constants, only: izero,zero,one,pi use mpisetup implicit none @@ -54,6 +54,18 @@ subroutine setup_linhx(rlat, rlon, time, ix, delx, ixp, delxp, iy, dely, & real(r_single) ,intent(in ) :: time ! observation time relative to middle of window integer(i_kind), intent(out) :: ix, iy, it, ixp, iyp, itp real(r_kind), intent(out) :: delx, dely, delxp, delyp, delt, deltp + ix=izero + iy=izero + it=izero + ixp=izero + iyp=izero + itp=izero + delx=zero + dely=zero + delxp=zero + delyp=zero + delt=zero + deltp=zero write(6,*)'this is a dummy subroutine, running this means something wrong ,stop' call stop2(555) @@ -110,6 +122,7 @@ subroutine calc_linhx(hx, dens, dhx_dx, hxpert, hx_ens, & type(raggedarr) ,intent(inout) :: hxpert ! interpolated background real(r_single) ,intent( out) :: hx_ens ! H (x_ens) integer(i_kind) i,j + hx_ens=zero write(6,*)'this is a dummy subroutine, running this means something wrong ,stop' call stop2(555) @@ -145,6 +158,7 @@ subroutine calc_linhx_modens(hx, dhx_dx, hxpert, hx_ens, vscale) !$$$ use kinds, only: r_kind,i_kind,r_single use sparsearr, only: sparr, raggedarr + use constants, only: zero use mpisetup implicit none @@ -155,6 +169,7 @@ subroutine calc_linhx_modens(hx, dhx_dx, hxpert, hx_ens, vscale) real(r_single) ,intent( out) :: hx_ens(neigv)! H (x_ens) real(r_double),dimension(neigv,nlevs+1) ,intent(in ) :: vscale ! vertical scaling (for modulated ens) integer(i_kind) i + hx_ens=zero write(6,*)'this is a dummy subroutine, running this means something wrong ,stop' call stop2(555) diff --git a/src/gsi/CMakeLists.txt b/src/gsi/CMakeLists.txt index f894b0a8a..d7a02ed99 100644 --- a/src/gsi/CMakeLists.txt +++ b/src/gsi/CMakeLists.txt @@ -62,6 +62,7 @@ find_package(NetCDF REQUIRED Fortran) if(OPENMP) find_package(OpenMP REQUIRED) endif() +find_package(ZLIB REQUIRED) # NCEPLibs dependencies find_package(bacio REQUIRED) @@ -157,6 +158,7 @@ target_link_libraries(gsi_fortran_obj PUBLIC w3emc::w3emc_d) target_link_libraries(gsi_fortran_obj PUBLIC sp::sp_d) target_link_libraries(gsi_fortran_obj PUBLIC bufr::bufr_d) target_link_libraries(gsi_fortran_obj PUBLIC crtm::crtm) +target_link_libraries(gsi_fortran_obj PUBLIC ZLIB::ZLIB) if(GSI_MODE MATCHES "Regional") target_link_libraries(gsi_fortran_obj PUBLIC wrf_io::wrf_io) endif() diff --git a/src/gsi/cplr_get_fv3_regional_ensperts.f90 b/src/gsi/cplr_get_fv3_regional_ensperts.f90 index 9b841f012..55aa41029 100644 --- a/src/gsi/cplr_get_fv3_regional_ensperts.f90 +++ b/src/gsi/cplr_get_fv3_regional_ensperts.f90 @@ -76,6 +76,7 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) use netcdf_mod , only: nc_check use gsi_rfv3io_mod, only: fv3lam_io_phymetvars3d_nouv use obsmod, only: if_model_dbz,if_model_fed + use mpi, only : MPI_Wtime, MPI_REAL8, MPI_SUCCESS, MPI_MAX implicit none @@ -119,6 +120,8 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) integer(i_kind):: imem_start,n_fv3sar integer(i_kind):: i_caseflag + real(kind=8) :: time_beg,time_end,walltime, tb,te,wt + integer(i_kind) :: ierr if(n_ens/=(n_ens_gfs+n_ens_fv3sar)) then write(6,*)'wrong, the sum of n_ens_gfs and n_ens_fv3sar not equal n_ens, stop' @@ -275,7 +278,7 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) end if end if - + tb=MPI_Wtime() do m=1,ntlevs_ens @@ -452,7 +455,8 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) if( .not. parallelization_over_ensmembers )then if (mype == 0) write(6,'(a,a)') & 'CALL READ_FV3_REGIONAL_ENSPERTS FOR ENS DATA with the filename str : ',trim(ensfilenam_str) - + + time_beg=MPI_Wtime() select case (i_caseflag) case (0) call this%general_read_fv3_regional(fv3_filename,ps,u,v,tv,rh,oz) @@ -469,9 +473,15 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) call this%general_read_fv3_regional(fv3_filename,ps,u,v,tv,rh,oz, & g_ql=ql,g_qi=qi,g_qr=qr,g_qs=qs,g_qg=qg,g_qnr=qnr,g_w=w,g_fed=fed) case (5) + !write(6,'("get_fv3_regional_ensperts_run: Before general_read_fv3_regional")') call this%general_read_fv3_regional(fv3_filename,ps,u,v,tv,rh,oz, & g_ql=ql,g_qi=qi,g_qr=qr,g_qs=qs,g_qg=qg,g_qnr=qnr,g_w=w,g_dbz=dbz,g_fed=fed) end select + time_end=MPI_Wtime() + call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("Maximum Walltime for general_read_fv3_regional" f15.4,I4)') walltime,i_caseflag + end if if( parallelization_over_ensmembers )then @@ -792,6 +802,11 @@ subroutine get_fv3_regional_ensperts_run(this,en_perts,nelen,ps_bar) end do enddo ! it 4d loop + te=MPI_Wtime() + call MPI_Reduce(te-tb, wt, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("Maximum Walltime to read ",I4," ensemble members ", f15.4)') n_ens_fv3sar,wt + ! CALCULATE ENSEMBLE SPREAD if(write_ens_sprd ) then call this%ens_spread_dualres_regional(mype,en_perts,nelen) @@ -851,7 +866,7 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g use hybrid_ensemble_parameters, only: grd_ens,q_hyb_ens use hybrid_ensemble_parameters, only: fv3sar_ensemble_opt,dual_res - use mpimod, only: mpi_comm_world,mpi_rtype + use mpimod, only: mpi_comm_world,mpi_rtype,mype use gsi_rfv3io_mod,only: type_fv3regfilenameg use gsi_rfv3io_mod,only:n2d use constants, only: half,zero @@ -870,6 +885,7 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g use directDA_radaruse_mod, only: l_use_cvpqx, cvpqx_pval, cld_nt_updt use directDA_radaruse_mod, only: l_cvpnr, cvpnr_pval use obsmod, only:if_model_dbz,if_model_fed + use mpi, only : MPI_Wtime, MPI_REAL8, MPI_MAX, MPI_SUCCESS implicit none @@ -913,6 +929,8 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g character(len=:),allocatable :: sfcdata !='fv3_sfcdata' character(len=:),allocatable :: couplerres!='coupler.res' integer (i_kind) ier,istatus + real(kind=8) :: time_beg,time_end,walltime + integer(i_kind) :: ierr associate( this => this ) ! eliminates warning for unused dummy argument needed for binding @@ -930,6 +948,7 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g couplerres=fv3_filenameginput%couplerres + !write(6,'("general_read_fv3_regional: fv3sar_ensemble_opt= " I4)') fv3sar_ensemble_opt if (allocated(fv3lam_ens_io_dynmetvars2d_nouv) ) then @@ -956,16 +975,28 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g end if - if(fv3sar_ensemble_opt == 0 ) then + if(fv3sar_ensemble_opt == 0 ) then + time_beg=MPI_Wtime() call gsi_fv3ncdf_readuv(grd_fv3lam_ens_uv,g_u,g_v,fv3_filenameginput,dual_res) + time_end=MPI_Wtime() + call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("general_read_fv3_regional: Maximum Walltime for gsi_fv3ncdf_readuv" f15.4)') walltime + else call gsi_fv3ncdf_readuv_v1(grd_fv3lam_ens_uv,g_u,g_v,fv3_filenameginput,dual_res) endif if(fv3sar_ensemble_opt == 0) then + time_beg=MPI_Wtime() call gsi_fv3ncdf_read(grd_fv3lam_ens_dynvar_io_nouv,gsibundle_fv3lam_ens_dynvar_nouv,& fv3_filenameginput%dynvars,fv3_filenameginput,dual_res) call gsi_fv3ncdf_read(grd_fv3lam_ens_tracer_io_nouv,gsibundle_fv3lam_ens_tracer_nouv,& fv3_filenameginput%tracers,fv3_filenameginput,dual_res) + time_end=MPI_Wtime() + call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("general_read_fv3_regional: Maximum Walltime for gsi_fv3ncdf_read" f15.4)') walltime + if( if_model_dbz .or. if_model_fed ) then call gsi_fv3ncdf_read(grd_fv3lam_ens_phyvar_io_nouv,gsibundle_fv3lam_ens_phyvar_nouv,& fv3_filenameginput%phyvars,fv3_filenameginput,dual_res) @@ -1013,6 +1044,8 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g endif !! tsen2tv !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + !$omp parallel do default(none),private(k,i,j) & + !$omp shared(grd_ens,g_q,g_tsen,g_tv,fv) do k=1,grd_ens%nsig do j=1,grd_ens%lon2 do i=1,grd_ens%lat2 @@ -1023,6 +1056,8 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g if (.not.q_hyb_ens) then ice=.true. iderivative=0 + !$omp parallel do default(none),private(k,i,j,kp) & + !$omp shared(grd_ens,g_prsi,g_prsl) do k=1,grd_ens%nsig kp=k+1 do j=1,grd_ens%lon2 @@ -1032,6 +1067,8 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g end do end do call genqsat(g_rh,g_tsen(1,1,1),g_prsl(1,1,1),grd_ens%lat2,grd_ens%lon2,grd_ens%nsig,ice,iderivative) + !$omp parallel do default(none),private(k,i,j) & + !$omp shared(grd_ens,g_rh,g_q) do k=1,grd_ens%nsig do j=1,grd_ens%lon2 do i=1,grd_ens%lat2 @@ -1051,6 +1088,8 @@ subroutine general_read_fv3_regional(this,fv3_filenameginput,g_ps,g_u,g_v,g_tv,g ! CV transform + !$omp parallel do default(none),private(k,i,j) & + !$omp shared(grd_ens,l_use_cvpqx,g_qr,cvpqx_pval,g_qs,g_qg,g_qnr,cld_nt_updt,l_cvpnr,cvpnr_pval) do k=1,grd_ens%nsig do i=1,grd_ens%lon2 do j=1,grd_ens%lat2 diff --git a/src/gsi/crc32.F90 b/src/gsi/crc32.F90 new file mode 100644 index 000000000..dd22e4c80 --- /dev/null +++ b/src/gsi/crc32.F90 @@ -0,0 +1,23 @@ +module crc32 + use iso_c_binding + implicit none + public :: digest + + interface + integer function digest_c(message) bind(c) + use iso_c_binding, only: c_char + character(kind=c_char), intent(in) :: message(*) + end function digest_c + end interface + + contains + + integer function digest(m) + use iso_c_binding, only: c_null_char + implicit none + character(len=*), intent(in) :: m + !m='nid001019' + digest=abs(digest_c(trim(m)//c_null_char)) + !write(6,'("Digest ",I12)') digest + end function digest +end module crc32 diff --git a/src/gsi/crc32_c.c b/src/gsi/crc32_c.c new file mode 100644 index 000000000..049a725ba --- /dev/null +++ b/src/gsi/crc32_c.c @@ -0,0 +1,7 @@ +#include +#include +#include + +uLong digest_c(char * message) { + return crc32(0, (const void*)message, strlen(message)); +} diff --git a/src/gsi/get_gefs_for_regional.f90 b/src/gsi/get_gefs_for_regional.f90 index cc5e0a2c8..196ad2ec8 100644 --- a/src/gsi/get_gefs_for_regional.f90 +++ b/src/gsi/get_gefs_for_regional.f90 @@ -83,6 +83,7 @@ subroutine get_gefs_for_regional use get_wrf_mass_ensperts_mod, only: get_wrf_mass_ensperts_class use gsi_io, only: verbose use obsmod, only: l_wcp_cwm + use mpi, only : MPI_Wtime, MPI_REAL8, MPI_SUCCESS implicit none type(sub2grid_info) grd_gfs,grd_mix,grd_gfst @@ -187,6 +188,8 @@ subroutine get_gefs_for_regional real(r_kind), pointer :: ges_q (:,:,:)=>NULL() logical :: print_verbose real(r_kind), allocatable :: ges_z_ens(:,:) + real(kind=8) :: time_beg,time_end,walltime, tb,te,wt + integer(i_kind) :: ierr print_verbose=.false. if(verbose)print_verbose=.true. @@ -621,6 +624,7 @@ subroutine get_gefs_for_regional rewind(10) inithead=.true. + tb=MPI_Wtime() do n=1,n_ens_gfs read(10,'(a)',err=20,end=20)filename filename=trim(ensemble_path) // trim(filename) @@ -641,8 +645,13 @@ subroutine get_gefs_for_regional call general_read_gfsatm_nems(grd_gfst,sp_gfs,filename,uv_hyb_ens,.false.,.true., & atm_bundle,.true.,iret) else if (use_gfs_ncio) then + time_beg=MPI_Wtime() call general_read_gfsatm_nc(grd_gfst,sp_gfs,filename,uv_hyb_ens,.false.,.true., & atm_bundle,.true.,iret) + time_end=MPI_Wtime() + call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("Maximum Walltime for general_read_gfsatm_nc" f15.4)') walltime else call general_read_gfsatm(grd_gfst,sp_gfs,sp_gfs,filename,uv_hyb_ens,.false.,.true., & atm_bundle,inithead,iret) @@ -955,6 +964,10 @@ subroutine get_gefs_for_regional ! if(mype==0) write(6,*)' with halo, n,min,max ges_ps - matt ps =',n,pdiffmin0,pdiffmax0 end do ! end loop over ensemble members. + te=MPI_Wtime() + call MPI_Reduce(te-tb, wt, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("Maximum Walltime to read ",I4," ensemble members ", f15.4)') n_ens_gfs,wt deallocate(ges_z_ens) diff --git a/src/gsi/gsi_files.cmake b/src/gsi/gsi_files.cmake index 95d885e2e..b7fb9e623 100644 --- a/src/gsi/gsi_files.cmake +++ b/src/gsi/gsi_files.cmake @@ -1,5 +1,6 @@ list(APPEND GSI_SRC_C blockIO.c +crc32_c.c ) # Class files for WRF interfaces @@ -264,6 +265,7 @@ guess_grids.F90 half_nmm_grid2.f90 hdraobmod.f90 hilbert_curve.f90 +crc32.F90 hybrid_ensemble_isotropic.F90 hybrid_ensemble_parameters.f90 inc2guess.f90 diff --git a/src/gsi/hybrid_ensemble_isotropic.F90 b/src/gsi/hybrid_ensemble_isotropic.F90 index 87f3605ea..d1c8a7211 100644 --- a/src/gsi/hybrid_ensemble_isotropic.F90 +++ b/src/gsi/hybrid_ensemble_isotropic.F90 @@ -732,6 +732,8 @@ subroutine new_factorization_rf_y(f,iadvance,iback,nlevs,ig) nx=grd_loc%nlon ; ny=grd_loc%nlat ; nz=nlevs if(vvlocal)then +!$omp parallel do default(none), schedule(static,1) private(k,j,i,l) & +!$omp shared(nx,ny,nz,ig,iadvance,ynorm_new,f,fmaty,fmat0y,iback) do k=1,nz do j=1,nx @@ -764,6 +766,8 @@ subroutine new_factorization_rf_y(f,iadvance,iback,nlevs,ig) enddo enddo else +!$omp parallel do default(none),schedule(static,1) private(k,j,i,l) & +!$omp shared(nx,ny,nz,ig,iadvance,ynorm_new,f,fmaty,fmat0y,iback) do k=1,nz do j=1,nx @@ -908,12 +912,22 @@ subroutine normal_new_factorization_rf_x use hybrid_ensemble_parameters, only: grd_loc,vvlocal use hybrid_ensemble_parameters, only: naensgrp,naensloc use constants, only: zero,one + use mpimod, only: mpi_rtype + use mpi + use crc32 implicit none - integer(i_kind) i,j,k,iadvance,iback,kl,ig + integer(i_kind) i,j,k,iadvance,iback,kl,ig,npes,inunit + integer(i_kind) name_len,message_len,nodeID,nodeComm,nodeRank,RanksPerNode,ierr real(r_kind) f(grd_loc%nlat,grd_loc%nlon,grd_loc%kend_alloc+1-grd_loc%kbegin_loc) real(r_kind),allocatable:: diag(:,:,:) + real(i_kind),allocatable:: sz(:) + character(len=72) :: input + character(len=5) :: np,nlat,nlon + logical :: exists + character(len=MPI_MAX_PROCESSOR_NAME) :: nodeName + character(len=MPI_MAX_ERROR_STRING) :: message ! real(r_kind) diag(grd_loc%nlat,grd_loc%nlon) ! possible to have kend_loc - kbegin_loc-1 for processors not involved @@ -926,59 +940,171 @@ subroutine normal_new_factorization_rf_x else kl=1 endif - if(allocated(xnorm_new)) deallocate(xnorm_new) - allocate(xnorm_new(grd_loc%nlat,grd_loc%nlon,kl,naensloc)) - if(allocated(diag)) deallocate(diag) - allocate(diag(grd_loc%nlat,grd_loc%nlon,kl)) - xnorm_new=one - do ig=1,naensgrp - do j=1,grd_loc%nlon - f=zero - do k=1,kl - do i=1,grd_loc%nlat - f(i,j,k)=one - enddo - enddo - iadvance=1 ; iback=2 - call new_factorization_rf_x(f,iadvance,iback,kl,ig) - iadvance=2 ; iback=1 - call new_factorization_rf_x(f,iadvance,iback,kl,ig) - do k=1,kl - do i=1,grd_loc%nlat - diag(i,j,k)=sqrt(one/f(i,j,k)) - enddo - enddo - enddo - do k=1,kl - do j=1,grd_loc%nlon - do i=1,grd_loc%nlat - xnorm_new(i,j,k,ig)=diag(i,j,k) - enddo - enddo - enddo - enddo !ig loop +! The Optimization depends on each rank on a node having the same sized xnorm_new array. +! This is generally not true when vvlocal=.true. (readin_localization in gsiparm.anl) because kl varies in that case + call MPI_Comm_size(MPI_COMM_WORLD,npes,ierr) + allocate(sz(npes)) + call MPI_Allgather((grd_loc%nlat*grd_loc%nlon*kl*naensloc),1,MPI_Integer,sz,1,MPI_Integer,MPI_COMM_WORLD,ierr) + + if(vvlocal .or. any(sz /= sz(1))) then + ! Must generate xnorm_new on all ranks + if (mype==0) write(6,'("new_factorization_rf_x: Default")') + if(allocated(xnorm_new)) deallocate(xnorm_new) + allocate(xnorm_new(grd_loc%nlat,grd_loc%nlon,kl,naensloc)) + !write(6,'("new_factorization_rf_x: Default ",5I6)'),mype,grd_loc%nlat,grd_loc%nlon,kl,naensloc + if(allocated(diag)) deallocate(diag) + allocate(diag(grd_loc%nlat,grd_loc%nlon,kl)) + xnorm_new=one + + do ig=1,naensgrp + do j=1,grd_loc%nlon + f=zero + do k=1,kl + do i=1,grd_loc%nlat + f(i,j,k)=one + enddo + enddo + iadvance=1 ; iback=2 + call new_factorization_rf_x(f,iadvance,iback,kl,ig) + iadvance=2 ; iback=1 + call new_factorization_rf_x(f,iadvance,iback,kl,ig) + do k=1,kl + do i=1,grd_loc%nlat + diag(i,j,k)=sqrt(one/f(i,j,k)) + enddo + enddo + enddo + do k=1,kl + do j=1,grd_loc%nlon + do i=1,grd_loc%nlat + xnorm_new(i,j,k,ig)=diag(i,j,k) + enddo + enddo + enddo + enddo !ig loop ! check accuracy of xnorm - if(debug) then - do j=1,grd_loc%nlon - f=zero - do k=1,kl - do i=1,grd_loc%nlat - f(i,j,k)=one + if(debug) then + do j=1,grd_loc%nlon + f=zero + do k=1,kl + do i=1,grd_loc%nlat + f(i,j,k)=one + enddo + enddo + iadvance=1 ; iback=2 + call new_factorization_rf_x(f,iadvance,iback,kl,1) + iadvance=2 ; iback=1 + call new_factorization_rf_x(f,iadvance,iback,kl,1) + do k=1,kl + do i=1,grd_loc%nlat + diag(i,j,k)=f(i,j,k) + enddo + enddo + enddo + write(6,*)' in normal_new_factorization_rf_x,min,max(diag)=',minval(diag),maxval(diag) + endif + elseif (any(sz == sz(1))) then ! Use optimization + if(mype==0) write(6,'("new_factorization_rf_x: Opt")') + !write(6,'("new_factorization_rf_x: Opt ",5I6)'),mype,grd_loc%nlat,grd_loc%nlon,kl,naensloc + ! Fill xnorm_new using one rank per compute node to avoid + ! contension for memory bandwidth + call MPI_Get_processor_name(nodeName,name_len,ierr) + nodeID=digest(trim(nodeName)) + !read(nodeName(4:9),*) nodeID ! Danger! This approach will not to work on platforms other than WCOSS2 + call MPI_Comm_split(mpi_comm_world, nodeID, mype, nodeComm, ierr) + call MPI_Comm_rank(nodeComm,nodeRank,ierr) + call MPI_Comm_size(nodeComm,RanksPerNode,ierr) + + if(allocated(xnorm_new)) deallocate(xnorm_new) + allocate(xnorm_new(grd_loc%nlat,grd_loc%nlon,kl,naensloc)) + + if(nodeRank==0) then + ! Check for existing file + write(nlat, '(i0)') grd_loc%nlat; nlat = adjustl(nlat) + write(nlon, '(i0)') grd_loc%nlon; nlon = adjustl(nlon) + write(np, '(i0)') npes; np = adjustl(np) + input= 'xnorm_new.'//trim(np)//'.'//trim(nlat)//'.'//trim(nlon) + inquire (file=trim(input), EXIST=exists) + if (exists) then + inunit=2000+mype + open(inunit,file=trim(input),form='unformatted',action='read') + read(inunit) xnorm_new + close(inunit) + else ! Generate the data and have rank 0 write it out for later use + if(allocated(diag)) deallocate(diag) + allocate(diag(grd_loc%nlat,grd_loc%nlon,kl)) + xnorm_new=one + + do ig=1,naensgrp + do j=1,grd_loc%nlon + f=zero + do k=1,kl + do i=1,grd_loc%nlat + f(i,j,k)=one + enddo + enddo + iadvance=1 ; iback=2 + call new_factorization_rf_x(f,iadvance,iback,kl,ig) + iadvance=2 ; iback=1 + call new_factorization_rf_x(f,iadvance,iback,kl,ig) + do k=1,kl + do i=1,grd_loc%nlat + diag(i,j,k)=sqrt(one/f(i,j,k)) + enddo + enddo enddo - enddo - iadvance=1 ; iback=2 - call new_factorization_rf_x(f,iadvance,iback,kl,1) - iadvance=2 ; iback=1 - call new_factorization_rf_x(f,iadvance,iback,kl,1) - do k=1,kl - do i=1,grd_loc%nlat - diag(i,j,k)=f(i,j,k) + do k=1,kl + do j=1,grd_loc%nlon + do i=1,grd_loc%nlat + xnorm_new(i,j,k,ig)=diag(i,j,k) + enddo + enddo enddo - enddo - enddo - write(6,*)' in normal_new_factorization_rf_x,min,max(diag)=',minval(diag),maxval(diag) + enddo !ig loop +! check accuracy of xnorm + if(debug) then + do j=1,grd_loc%nlon + f=zero + do k=1,kl + do i=1,grd_loc%nlat + f(i,j,k)=one + enddo + enddo + iadvance=1 ; iback=2 + call new_factorization_rf_x(f,iadvance,iback,kl,1) + iadvance=2 ; iback=1 + call new_factorization_rf_x(f,iadvance,iback,kl,1) + do k=1,kl + do i=1,grd_loc%nlat + diag(i,j,k)=f(i,j,k) + enddo + enddo + enddo + write(6,*)' in normal_new_factorization_rf_x,min,max(diag)=',minval(diag),maxval(diag) + endif + ! File didnt exist so we computed the data. Now save it for subsequent runs. + if(mype==0) then + open(inunit,file=trim(input),form='unformatted',action='write') + write(inunit) xnorm_new + close(inunit) + endif + endif ! file exists + endif ! nodeRank==0 + +! Broadcast xnorm_new to the other ranks on the same node + call MPI_Bcast(xnorm_new,size(xnorm_new),mpi_rtype,0,nodeComm,ierr) + if(ierr /= MPI_SUCCESS) then + call MPI_Error_string(ierr, message, message_len, ierr) + write(6,'("new_factorization_rf_x: Error from MPI_Bcast ",I4,A)') mype, trim(message) + call MPI_Abort(MPI_COMM_WORLD, 10, ierr) + endif + call MPI_Comm_free(nodeComm,ierr) + else + write(6,'("new_factorization_rf_x: Scenario not yet supported")') + call MPI_Abort(MPI_COMM_WORLD, 13, ierr) endif + return end subroutine normal_new_factorization_rf_x diff --git a/src/gsi/mod_fv3_lola.f90 b/src/gsi/mod_fv3_lola.f90 index 11bb3b6e3..b01be9fbb 100644 --- a/src/gsi/mod_fv3_lola.f90 +++ b/src/gsi/mod_fv3_lola.f90 @@ -128,8 +128,12 @@ subroutine generate_anl_grid(nx,ny,grid_lon,grid_lont,grid_lat,grid_latt) use gridmod, only:grid_ratio_fv3_regional, region_lat,region_lon,nlat,nlon use gridmod, only: region_dy,region_dx,region_dyi,region_dxi,coeffy,coeffx use gridmod, only:init_general_transform,region_dy,region_dx - use mpimod, only: mype + use mpimod, only: mype,mpi_rtype use egrid2agrid_mod, only: egrid2agrid_parm + use mpi + use crc32 + use ifcore + use ifport implicit none real(r_kind),allocatable,dimension(:)::xbh_a,xa_a,xa_b @@ -164,431 +168,641 @@ subroutine generate_anl_grid(nx,ny,grid_lon,grid_lont,grid_lat,grid_latt) real(r_kind) d(4),ds integer(i_kind) kk,k + integer(i_kind) name_len,nodeID,nodeComm,nodeRank,RanksPerNode,ierr,npes,inunit + character(len=72) :: input + character(len=5) :: np,nlatc,nlonc + logical :: res,exists + character(len=MPI_MAX_PROCESSOR_NAME) :: nodeName + !real(kind=8) :: time_beg,time_end,walltime + nord_e2a=4 bilinear=.false. + call MPI_Comm_size(mpi_comm_world,npes,ierr) + call MPI_Get_processor_name(nodeName,name_len,ierr) + nodeID=digest(trim(nodeName)) + !read(nodeName(4:9),*) nodeID ! Danger! This approach will not to work on platforms other than WCOSS2 + call MPI_Comm_split(mpi_comm_world, nodeID, mype, nodeComm, ierr) + call MPI_Comm_rank(nodeComm,nodeRank,ierr) + call MPI_Comm_size(nodeComm,RanksPerNode,ierr) + + ! Check for existing file + write(nlatc, '(i0)') nx; nlatc = adjustl(nlatc) + write(nlonc, '(i0)') ny; nlonc = adjustl(nlonc) + write(np, '(i0)') npes; np = adjustl(np) + input= 'anl_grid.'//trim(np)//'.'//trim(nlatc)//'.'//trim(nlonc) + inquire (file=trim(input), EXIST=exists) + + if(nodeRank==0) then + if (exists) then + !write(6,'("generate_anl_grid: Reading from ana_grid ",I4,2A)') mype,' ',trim(input) + inunit=2000+mype + open(inunit,file=trim(input),form='unformatted',action='read') + read(inunit) nlat,nlon,nxa,nya + if (allocated(region_dx )) deallocate(region_dx ) + if (allocated(region_dy )) deallocate(region_dy ) + allocate( region_dx(nlat,nlon), region_dy(nlat,nlon)) + allocate(region_dxi(nlat,nlon),region_dyi(nlat,nlon)) + allocate(coeffx(nlat,nlon),coeffy(nlat,nlon)) + if (allocated(region_lat)) deallocate(region_lat) + if (allocated(region_lon)) deallocate(region_lon) + allocate(region_lat(nlat,nlon),region_lon(nlat,nlon)) + allocate(fv3dx(nxa,nya),fv3dx1(nxa,nya),fv3dy(nxa,nya),fv3dy1(nxa,nya) ) + allocate(fv3ix(nxa,nya),fv3ixp(nxa,nya),fv3jy(nxa,nya),fv3jyp(nxa,nya) ) + allocate( a3dx( ny, nx),a3dx1(ny,nx),a3dy(ny,nx),a3dy1(ny,nx) ) + allocate( a3ix( ny, nx),a3ixp(ny,nx),a3jy(ny,nx),a3jyp(ny,nx) ) + allocate(cangu( nx,ny+1),sangu(nx,ny+1),cangv(nx+1,ny),sangv(nx+1,ny) ) + + rewind(inunit) + !time_beg=MPI_Wtime() + read(inunit) nlat,nlon,nxa,nya, & + region_dx,region_dy,region_dxi,region_dyi, & + coeffx,coeffy,region_lat,region_lon, & + fv3dx,fv3dx1,fv3dy,fv3dy1,fv3ix,fv3ixp,fv3jy,fv3jyp, & + a3dx,a3dx1,a3dy,a3dy1,a3ix,a3ixp,a3jy,a3jyp, & + cangu,sangu,cangv,sangv + !time_end=MPI_Wtime() + !call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, nodeComm, ierr) + !if(mype==0) write(6,'("Maximum Walltime to read anl_grid " f15.4)') walltime + + close(inunit) + else ! create xc,yc,zc for the cell centers. - allocate(xc(nx,ny)) - allocate(yc(nx,ny)) - allocate(zc(nx,ny)) - allocate(gclat(nx,ny)) - allocate(gclon(nx,ny)) - allocate(gcrlat(nx,ny)) - allocate(gcrlon(nx,ny)) - do j=1,ny - do i=1,nx - xc(i,j)=cos(grid_latt(i,j)*deg2rad)*cos(grid_lont(i,j)*deg2rad) - yc(i,j)=cos(grid_latt(i,j)*deg2rad)*sin(grid_lont(i,j)*deg2rad) - zc(i,j)=sin(grid_latt(i,j)*deg2rad) - enddo - enddo + allocate(xc(nx,ny)) + allocate(yc(nx,ny)) + allocate(zc(nx,ny)) + allocate(gclat(nx,ny)) + allocate(gclon(nx,ny)) + allocate(gcrlat(nx,ny)) + allocate(gcrlon(nx,ny)) + !$omp parallel do default(none),private(j,i) & + !$omp shared(ny,nx,xc,grid_latt,grid_lont,deg2rad,yc,zc) + do j=1,ny + do i=1,nx + xc(i,j)=cos(grid_latt(i,j)*deg2rad)*cos(grid_lont(i,j)*deg2rad) + yc(i,j)=cos(grid_latt(i,j)*deg2rad)*sin(grid_lont(i,j)*deg2rad) + zc(i,j)=sin(grid_latt(i,j)*deg2rad) + enddo + enddo + !$omp end parallel do ! compute center as average x,y,z coordinates of corners of domain -- - xcent=quarter*(xc(1,1)+xc(1,ny)+xc(nx,1)+xc(nx,ny)) - ycent=quarter*(yc(1,1)+yc(1,ny)+yc(nx,1)+yc(nx,ny)) - zcent=quarter*(zc(1,1)+zc(1,ny)+zc(nx,1)+zc(nx,ny)) + xcent=quarter*(xc(1,1)+xc(1,ny)+xc(nx,1)+xc(nx,ny)) + ycent=quarter*(yc(1,1)+yc(1,ny)+yc(nx,1)+yc(nx,ny)) + zcent=quarter*(zc(1,1)+zc(1,ny)+zc(nx,1)+zc(nx,ny)) - rnorm=one/sqrt(xcent**2+ycent**2+zcent**2) - xcent=rnorm*xcent - ycent=rnorm*ycent - zcent=rnorm*zcent - centlat=asin(zcent)*rad2deg - centlon=atan2(ycent,xcent)*rad2deg + rnorm=one/sqrt(xcent**2+ycent**2+zcent**2) + xcent=rnorm*xcent + ycent=rnorm*ycent + zcent=rnorm*zcent + centlat=asin(zcent)*rad2deg + centlon=atan2(ycent,xcent)*rad2deg !! compute new lats, lons - call rotate2deg(grid_lont,grid_latt,gcrlon,gcrlat, & - centlon,centlat,nx,ny) + call rotate2deg(grid_lont,grid_latt,gcrlon,gcrlat, & + centlon,centlat,nx,ny) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !! compute analysis A-grid lats, lons !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !--------------------------obtain analysis grid dimensions nxa,nya - nxa=1+nint((nx-one)/grid_ratio_fv3_regional) - nya=1+nint((ny-one)/grid_ratio_fv3_regional) - nlat=nya - nlon=nxa - if(mype==0) print *,'nlat,nlon=nya,nxa= ',nlat,nlon + nxa=1+nint((nx-one)/grid_ratio_fv3_regional) + nya=1+nint((ny-one)/grid_ratio_fv3_regional) + nlat=nya + nlon=nxa + if(mype==0) print *,'nlat,nlon=nya,nxa= ',nlat,nlon !--------------------------obtain analysis grid spacing - dlat=(maxval(gcrlat)-minval(gcrlat))/(ny-1) - dlon=(maxval(gcrlon)-minval(gcrlon))/(nx-1) - adlat=dlat*grid_ratio_fv3_regional - adlon=dlon*grid_ratio_fv3_regional + dlat=(maxval(gcrlat)-minval(gcrlat))/(ny-1) + dlon=(maxval(gcrlon)-minval(gcrlon))/(nx-1) + adlat=dlat*grid_ratio_fv3_regional + adlon=dlon*grid_ratio_fv3_regional !-------setup analysis A-grid; find center of the domain - nlonh=nlon/2 - nlath=nlat/2 - - if(nlonh*2==nlon)then - clon=adlon/two - cx=half - else - clon=adlon - cx=one - endif - - if(nlath*2==nlat)then - clat=adlat/two - cy=half - else - clat=adlat - cy=one - endif + nlonh=nlon/2 + nlath=nlat/2 + + if(nlonh*2==nlon)then + clon=adlon/two + cx=half + else + clon=adlon + cx=one + endif + + if(nlath*2==nlat)then + clat=adlat/two + cy=half + else + clat=adlat + cy=one + endif ! !-----setup analysis A-grid from center of the domain ! - allocate(rlat_in(nlat,nlon),rlon_in(nlat,nlon)) - do j=1,nlon - alon=(j-nlonh)*adlon-clon - do i=1,nlat - rlon_in(i,j)=alon - enddo - enddo - - - do j=1,nlon - do i=1,nlat - rlat_in(i,j)=(i-nlath)*adlat-clat - enddo - enddo - - if (allocated(region_dx )) deallocate(region_dx ) - if (allocated(region_dy )) deallocate(region_dy ) - allocate(region_dx(nlat,nlon),region_dy(nlat,nlon)) - allocate(region_dxi(nlat,nlon),region_dyi(nlat,nlon)) - allocate(coeffx(nlat,nlon),coeffy(nlat,nlon)) - dyy=rearth*adlat*deg2rad - dyyi=one/dyy - dyyh=half/dyy - do j=1,nlon - do i=1,nlat - region_dy(i,j)=dyy - region_dyi(i,j)=dyyi - coeffy(i,j)=dyyh - enddo - enddo - - do i=1,nlat - dxx=rearth*cos(rlat_in(i,1)*deg2rad)*adlon*deg2rad - dxxi=one/dxx - dxxh=half/dxx - do j=1,nlon - region_dx(i,j)=dxx - region_dxi(i,j)=dxxi - coeffx(i,j)=dxxh - enddo - enddo + allocate(rlat_in(nlat,nlon),rlon_in(nlat,nlon)) + !$omp parallel do default(none),private(j,alon,i) & + !$omp shared(nlon,nlat,rlon_in,nlonh,adlon,clon) + do j=1,nlon + alon=(j-nlonh)*adlon-clon + do i=1,nlat + rlon_in(i,j)=alon + enddo + enddo + !$omp end parallel do + + !$omp parallel do default(none),private(j,i) & + !$omp shared(nlon,nlat,rlat_in,nlath,adlat,clat) + do j=1,nlon + do i=1,nlat + rlat_in(i,j)=(i-nlath)*adlat-clat + enddo + enddo + !$omp end parallel do + + if (allocated(region_dx )) deallocate(region_dx ) + if (allocated(region_dy )) deallocate(region_dy ) + allocate(region_dx(nlat,nlon),region_dy(nlat,nlon)) + allocate(region_dxi(nlat,nlon),region_dyi(nlat,nlon)) + allocate(coeffx(nlat,nlon),coeffy(nlat,nlon)) + dyy=rearth*adlat*deg2rad + dyyi=one/dyy + dyyh=half/dyy + !$omp parallel do default(none),private(j,i) & + !$omp shared(nlon,nlat,dyy,dyyi,dyyh,region_dy,region_dyi,coeffy) + do j=1,nlon + do i=1,nlat + region_dy(i,j)=dyy + region_dyi(i,j)=dyyi + coeffy(i,j)=dyyh + enddo + enddo + !$omp end parallel do + + !$omp parallel do default(none),private(j,i,dxx,dxxi,dxxh) & + !$omp shared(nlon,nlat,rearth,rlat_in,deg2rad,adlon,region_dx,region_dxi,coeffx) + do i=1,nlat + dxx=rearth*cos(rlat_in(i,1)*deg2rad)*adlon*deg2rad + dxxi=one/dxx + dxxh=half/dxx + do j=1,nlon + region_dx(i,j)=dxx + region_dxi(i,j)=dxxi + coeffx(i,j)=dxxh + enddo + enddo + !$omp end parallel do ! !---------- setup region_lat,region_lon in earth coord ! - if (allocated(region_lat)) deallocate(region_lat) - if (allocated(region_lon)) deallocate(region_lon) - allocate(region_lat(nlat,nlon),region_lon(nlat,nlon)) - allocate(glat_an(nlon,nlat),glon_an(nlon,nlat)) + if (allocated(region_lat)) deallocate(region_lat) + if (allocated(region_lon)) deallocate(region_lon) + allocate(region_lat(nlat,nlon),region_lon(nlat,nlon)) + allocate(glat_an(nlon,nlat),glon_an(nlon,nlat)) - call unrotate2deg(region_lon,region_lat,rlon_in,rlat_in, & + call unrotate2deg(region_lon,region_lat,rlon_in,rlat_in, & centlon,centlat,nlat,nlon) - region_lat=region_lat*deg2rad - region_lon=region_lon*deg2rad - - do j=1,nlat - do i=1,nlon - glat_an(i,j)=region_lat(j,i) - glon_an(i,j)=region_lon(j,i) - enddo - enddo - - call init_general_transform(glat_an,glon_an) + region_lat=region_lat*deg2rad + region_lon=region_lon*deg2rad + !$omp parallel do default(none),private(j,i) & + !$omp shared(nlon,nlat,region_lat,region_lon,glat_an,glon_an) + do j=1,nlat + do i=1,nlon + glat_an(i,j)=region_lat(j,i) + glon_an(i,j)=region_lon(j,i) + enddo + enddo + !$omp end parallel do + + call init_general_transform(glat_an,glon_an) - deallocate(glat_an,glon_an) + deallocate(glat_an,glon_an) !--------------------compute all combinations of relative coordinates - allocate(xbh_a(nx),xbh_b(nx,ny),xa_a(nxa),xa_b(nxa)) - allocate(ybh_a(ny),ybh_b(nx,ny),ya_a(nya),ya_b(nya)) + allocate(xbh_a(nx),xbh_b(nx,ny),xa_a(nxa),xa_b(nxa)) + allocate(ybh_a(ny),ybh_b(nx,ny),ya_a(nya),ya_b(nya)) - nxh=nx/2 - nyh=ny/2 + nxh=nx/2 + nyh=ny/2 !!!!!! fv3 rotated grid; not equal spacing, non_orthogonal !!!!!! - do j=1,ny - jr=ny+1-j - do i=1,nx - ir=nx+1-i - xbh_b(ir,jr)=gcrlon(i,j)/dlon - end do - end do - do j=1,ny - jr=ny+1-j - do i=1,nx - ir=nx+1-i - ybh_b(ir,jr)=gcrlat(i,j)/dlat - end do - end do + !$omp parallel do default(none),private(j,jr,i,ir) & + !$omp shared(ny,nx,xbh_b,gcrlon,dlon) + do j=1,ny + jr=ny+1-j + do i=1,nx + ir=nx+1-i + xbh_b(ir,jr)=gcrlon(i,j)/dlon + end do + end do + !$omp end parallel do + + !$omp parallel do default(none),private(j,jr,i,ir) & + !$omp shared(ny,nx,ybh_b,gcrlat,dlat) + do j=1,ny + jr=ny+1-j + do i=1,nx + ir=nx+1-i + ybh_b(ir,jr)=gcrlat(i,j)/dlat + end do + end do + !$omp end parallel do !!!! define analysis A grid !!!!!!!!!!!!! - do j=1,nxa - xa_a(j)=(real(j-nlonh,r_kind)-cx)*grid_ratio_fv3_regional - end do - do i=1,nya - ya_a(i)=(real(i-nlath,r_kind)-cy)*grid_ratio_fv3_regional - end do + do j=1,nxa + xa_a(j)=(real(j-nlonh,r_kind)-cx)*grid_ratio_fv3_regional + end do + do i=1,nya + ya_a(i)=(real(i-nlath,r_kind)-cy)*grid_ratio_fv3_regional + end do !!!!!compute fv3 to A grid interpolation parameters !!!!!!!!! - allocate ( fv3dx(nxa,nya),fv3dx1(nxa,nya),fv3dy(nxa,nya),fv3dy1(nxa,nya) ) - allocate ( fv3ix(nxa,nya),fv3ixp(nxa,nya),fv3jy(nxa,nya),fv3jyp(nxa,nya) ) - allocate(yy(ny)) + allocate ( fv3dx(nxa,nya),fv3dx1(nxa,nya),fv3dy(nxa,nya),fv3dy1(nxa,nya) ) + allocate ( fv3ix(nxa,nya),fv3ixp(nxa,nya),fv3jy(nxa,nya),fv3jyp(nxa,nya) ) + allocate(yy(ny)) ! iteration to find the fv3 grid cell - jb1=1 - ib1=1 - do j=1,nya - do i=1,nxa - do n=1,3 - gxa=xa_a(i) - if(gxa < xbh_b(1,jb1))then - gxa= 1 - else if(gxa > xbh_b(nx,jb1))then - gxa= nx - else - call grdcrd1(gxa,xbh_b(1,jb1),nx,1) - endif - ib2=ib1 - ib1=gxa - do jj=1,ny - yy(jj)=ybh_b(ib1,jj) - enddo - gya=ya_a(j) - if(gya < yy(1))then - gya= 1 - else if(gya > yy(ny))then - gya= ny - else - call grdcrd1(gya,yy,ny,1) - endif - jb2=jb1 - jb1=gya - if(ib1+1 > nx)then !this block( 6 lines) is copied from GSL gsi repository - ib1=ib1-1 - endif - if(jb1+1 > ny)then - jb1=jb1-1 - endif - - - if((ib1 == ib2) .and. (jb1 == jb2)) exit - if(n==3 ) then + jb1=1 + ib1=1 + !$omp parallel do default(none),private(j,i,n,gxa,ib2,jj,yy,gya,jb2,d,kk,k,ds) & + !$omp shared(nya,nxa,xa_a,xbh_b,nx,ybh_b,ya_a,ny,fv3ix,fv3ixp,fv3jy,fv3jyp,fv3dy,fv3dy1,fv3dx,fv3dx1,bilinear) & + !$omp firstprivate(jb1,ib1) + do j=1,nya + do i=1,nxa + do n=1,3 + gxa=xa_a(i) + if(gxa < xbh_b(1,jb1))then + gxa= 1 + else if(gxa > xbh_b(nx,jb1))then + gxa= nx + else + call grdcrd1(gxa,xbh_b(1,jb1),nx,1) + endif + ib2=ib1 + ib1=gxa + do jj=1,ny + yy(jj)=ybh_b(ib1,jj) + enddo + gya=ya_a(j) + if(gya < yy(1))then + gya= 1 + else if(gya > yy(ny))then + gya= ny + else + call grdcrd1(gya,yy,ny,1) + endif + jb2=jb1 + jb1=gya + if(ib1+1 > nx)then !this block( 6 lines) is copied from GSL gsi repository + ib1=ib1-1 + endif + if(jb1+1 > ny)then + jb1=jb1-1 + endif + + + if((ib1 == ib2) .and. (jb1 == jb2)) exit + if(n==3 ) then !!!!!!! if not converge, find the nearest corner point - d(1)=(xa_a(i)-xbh_b(ib1,jb1))**2+(ya_a(j)-ybh_b(ib1,jb1))**2 - d(2)=(xa_a(i)-xbh_b(ib1+1,jb1))**2+(ya_a(j)-ybh_b(ib1+1,jb1))**2 - d(3)=(xa_a(i)-xbh_b(ib1,jb1+1))**2+(ya_a(j)-ybh_b(ib1,jb1+1))**2 - d(4)=(xa_a(i)-xbh_b(ib1+1,jb1+1))**2+(ya_a(j)-ybh_b(ib1+1,jb1+1))**2 - kk=1 - do k=2,4 - if(d(k) xa_a(nxa))then - gxa= nxa - else - call grdcrd1(gxa,xa_a,nxa,1) - endif - a3ix(j,i)=int(gxa) - a3ix(j,i)=min(max(1,a3ix(j,i)),nxa) - a3dx(j,i)=max(zero,min(one,gxa-a3ix(j,i))) - a3dx1(j,i)=one-a3dx(j,i) - a3ixp(j,i)=min(nxa,a3ix(j,i)+1) - end do - end do - - do i=1,nx - do j=1,ny - gya=ybh_b(i,j) - if(gya < ya_a(1))then - gya= 1 - else if(gya > ya_a(nya))then - gya= nya - else - call grdcrd1(gya,ya_a,nya,1) - endif - a3jy(j,i)=int(gya) - a3jy(j,i)=min(max(1,a3jy(j,i)),nya) - a3dy(j,i)=max(zero,min(one,gya-a3jy(j,i))) - a3dy1(j,i)=one-a3dy(j,i) - a3jyp(j,i)=min(nya,a3jy(j,i)+1) - end do - end do + allocate ( a3dx(ny,nx),a3dx1(ny,nx),a3dy(ny,nx),a3dy1(ny,nx) ) + allocate ( a3ix(ny,nx),a3ixp(ny,nx),a3jy(ny,nx),a3jyp(ny,nx) ) + !$omp parallel do default(none),private(j,i,gxa) & + !$omp shared(ny,nx,xbh_b,xa_a,nxa,a3ix,a3dx,a3dx1,a3ixp) + do i=1,nx + do j=1,ny + gxa=xbh_b(i,j) + if(gxa < xa_a(1))then + gxa= 1 + else if(gxa > xa_a(nxa))then + gxa= nxa + else + call grdcrd1(gxa,xa_a,nxa,1) + endif + a3ix(j,i)=int(gxa) + a3ix(j,i)=min(max(1,a3ix(j,i)),nxa) + a3dx(j,i)=max(zero,min(one,gxa-a3ix(j,i))) + a3dx1(j,i)=one-a3dx(j,i) + a3ixp(j,i)=min(nxa,a3ix(j,i)+1) + end do + end do + !$omp end parallel do + + !$omp parallel do default(none),private(j,i,gya) & + !$omp shared(ny,nx,ybh_b,ya_a,nya,a3jy,a3dy,a3dy1,a3jyp) + do i=1,nx + do j=1,ny + gya=ybh_b(i,j) + if(gya < ya_a(1))then + gya= 1 + else if(gya > ya_a(nya))then + gya= nya + else + call grdcrd1(gya,ya_a,nya,1) + endif + a3jy(j,i)=int(gya) + a3jy(j,i)=min(max(1,a3jy(j,i)),nya) + a3dy(j,i)=max(zero,min(one,gya-a3jy(j,i))) + a3dy1(j,i)=one-a3dy(j,i) + a3jyp(j,i)=min(nya,a3jy(j,i)+1) + end do + end do + !$omp end parallel do !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !!! find coefficients for wind conversion btw FV3 & earth !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - allocate ( cangu(nx,ny+1),sangu(nx,ny+1),cangv(nx+1,ny),sangv(nx+1,ny) ) + allocate ( cangu(nx,ny+1),sangu(nx,ny+1),cangv(nx+1,ny),sangv(nx+1,ny) ) ! 1. compute x,y,z at cell cornor from grid_lon, grid_lat - - do j=1,ny+1 - do i=1,nx+1 - x(i,j)=cos(grid_lat(i,j)*deg2rad)*cos(grid_lon(i,j)*deg2rad) - y(i,j)=cos(grid_lat(i,j)*deg2rad)*sin(grid_lon(i,j)*deg2rad) - z(i,j)=sin(grid_lat(i,j)*deg2rad) - enddo - enddo + !$omp parallel do default(none),private(j,i) & + !$omp shared(ny,nx,grid_lat,grid_lon,deg2rad,x,y,z) + do j=1,ny+1 + do i=1,nx+1 + x(i,j)=cos(grid_lat(i,j)*deg2rad)*cos(grid_lon(i,j)*deg2rad) + y(i,j)=cos(grid_lat(i,j)*deg2rad)*sin(grid_lon(i,j)*deg2rad) + z(i,j)=sin(grid_lat(i,j)*deg2rad) + enddo + enddo + !$omp end parallel do ! 2 find angles to E-W and N-S for U edges - sq180=180._r_kind**2 - do j=1,ny+1 - do i=1,nx -! center lat/lon of the edge - rlat=half*(grid_lat(i,j)+grid_lat(i+1,j)) - diff=(grid_lon(i,j)-grid_lon(i+1,j))**2 - if(diff < sq180)then - rlon=half*(grid_lon(i,j)+grid_lon(i+1,j)) - else - rlon=half*(grid_lon(i,j)+grid_lon(i+1,j)-360._r_kind) - endif + sq180=180._r_kind**2 + !$omp parallel do default(none),private(j,i,rlat,diff,rlon,xr,yr,zr,xu,yu,zu,uval,ewval,nsval) & + !$omp shared(ny,nx,grid_lat,grid_lon,sq180,deg2rad,x,y,z,cangu,sangu) + do j=1,ny+1 + do i=1,nx +! center lat/lon of the edge + rlat=half*(grid_lat(i,j)+grid_lat(i+1,j)) + diff=(grid_lon(i,j)-grid_lon(i+1,j))**2 + if(diff < sq180)then + rlon=half*(grid_lon(i,j)+grid_lon(i+1,j)) + else + rlon=half*(grid_lon(i,j)+grid_lon(i+1,j)-360._r_kind) + endif ! vector to center of the edge - xr=cos(rlat*deg2rad)*cos(rlon*deg2rad) - yr=cos(rlat*deg2rad)*sin(rlon*deg2rad) - zr=sin(rlat*deg2rad) + xr=cos(rlat*deg2rad)*cos(rlon*deg2rad) + yr=cos(rlat*deg2rad)*sin(rlon*deg2rad) + zr=sin(rlat*deg2rad) ! vector of the edge - xu= x(i+1,j)-x(i,j) - yu= y(i+1,j)-y(i,j) - zu= z(i+1,j)-z(i,j) + xu= x(i+1,j)-x(i,j) + yu= y(i+1,j)-y(i,j) + zu= z(i+1,j)-z(i,j) ! find angle with cross product - uval=sqrt((xu**2+yu**2+zu**2)) - ewval=sqrt((xr**2+yr**2)) - nsval=sqrt((xr*zr)**2+(zr*yr)**2+(xr*xr+yr*yr)**2) - cangu(i,j)=(-yr*xu+xr*yu)/ewval/uval - sangu(i,j)=(-xr*zr*xu-zr*yr*yu+(xr*xr+yr*yr)*zu) / nsval/uval - enddo - enddo + uval=sqrt((xu**2+yu**2+zu**2)) + ewval=sqrt((xr**2+yr**2)) + nsval=sqrt((xr*zr)**2+(zr*yr)**2+(xr*xr+yr*yr)**2) + cangu(i,j)=(-yr*xu+xr*yu)/ewval/uval + sangu(i,j)=(-xr*zr*xu-zr*yr*yu+(xr*xr+yr*yr)*zu) / nsval/uval + enddo + enddo + !$omp end parallel do ! 3 find angles to E-W and N-S for V edges - do j=1,ny - do i=1,nx+1 - rlat=half*(grid_lat(i,j)+grid_lat(i,j+1)) - diff=(grid_lon(i,j)-grid_lon(i,j+1))**2 - if(diff < sq180)then - rlon=half*(grid_lon(i,j)+grid_lon(i,j+1)) - else - rlon=half*(grid_lon(i,j)+grid_lon(i,j+1)-360._r_kind) - endif - xr=cos(rlat*deg2rad)*cos(rlon*deg2rad) - yr=cos(rlat*deg2rad)*sin(rlon*deg2rad) - zr=sin(rlat*deg2rad) - xv= x(i,j+1)-x(i,j) - yv= y(i,j+1)-y(i,j) - zv= z(i,j+1)-z(i,j) - vval=sqrt((xv**2+yv**2+zv**2)) - ewval=sqrt((xr**2+yr**2)) - nsval=sqrt((xr*zr)**2+(zr*yr)**2+(xr*xr+yr*yr)**2) - cangv(i,j)=(-yr*xv+xr*yv)/ewval/vval - sangv(i,j)=(-xr*zr*xv-zr*yr*yv+(xr*xr+yr*yr)*zv) / nsval/vval - enddo - enddo - deallocate( xc,yc,zc,gclat,gclon,gcrlat,gcrlon) - deallocate(rlat_in,rlon_in) + !$omp parallel do default(none),private(j,i,rlat,diff,rlon,xr,yr,zr,xv,yv,zv,vval,ewval,nsval) & + !$omp shared(ny,nx,grid_lat,grid_lon,sq180,deg2rad,x,y,z,cangv,sangv) + do j=1,ny + do i=1,nx+1 + rlat=half*(grid_lat(i,j)+grid_lat(i,j+1)) + diff=(grid_lon(i,j)-grid_lon(i,j+1))**2 + if(diff < sq180)then + rlon=half*(grid_lon(i,j)+grid_lon(i,j+1)) + else + rlon=half*(grid_lon(i,j)+grid_lon(i,j+1)-360._r_kind) + endif + xr=cos(rlat*deg2rad)*cos(rlon*deg2rad) + yr=cos(rlat*deg2rad)*sin(rlon*deg2rad) + zr=sin(rlat*deg2rad) + xv= x(i,j+1)-x(i,j) + yv= y(i,j+1)-y(i,j) + zv= z(i,j+1)-z(i,j) + vval=sqrt((xv**2+yv**2+zv**2)) + ewval=sqrt((xr**2+yr**2)) + nsval=sqrt((xr*zr)**2+(zr*yr)**2+(xr*xr+yr*yr)**2) + cangv(i,j)=(-yr*xv+xr*yv)/ewval/vval + sangv(i,j)=(-xr*zr*xv-zr*yr*yv+(xr*xr+yr*yr)*zv) / nsval/vval + enddo + enddo + !$omp end parallel do + deallocate( xc,yc,zc,gclat,gclon,gcrlat,gcrlon) + deallocate(rlat_in,rlon_in) + ! File didnt exist so we computed the data. Now save it for subsequent runs. + if(mype==0) then + inunit=2000+mype + open(inunit,file=trim(input),form='unformatted',action='write') + write(inunit) nlat,nlon,nxa,nya, & + region_dx,region_dy,region_dxi,region_dyi, & + coeffx,coeffy,region_lat,region_lon, & + fv3dx,fv3dx1,fv3dy,fv3dy1,fv3ix,fv3ixp,fv3jy,fv3jyp, & + a3dx,a3dx1,a3dy,a3dy1,a3ix,a3ixp,a3jy,a3jyp, & + cangu,sangu,cangv,sangv + !call flush(inunit) + !res = COMMITQQ(inunit) + close(inunit) + !write(6,'("generate_anl_grid: Wrote ana_grid ",I4,A)') mype, trim(input) + endif + endif ! file exists + endif ! nodeRank==0 + + call MPI_Bcast( nxa,1,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast( nya,1,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(nlat,1,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(nlon,1,mpi_integer,0,nodeComm,ierr) + if(nodeRank/=0) then + if (allocated(region_dx )) deallocate(region_dx ) + if (allocated(region_dy )) deallocate(region_dy ) + allocate( region_dx(nlat,nlon), region_dy(nlat,nlon)) + allocate(region_dxi(nlat,nlon),region_dyi(nlat,nlon)) + allocate(coeffx(nlat,nlon),coeffy(nlat,nlon)) + if (allocated(region_lat)) deallocate(region_lat) + if (allocated(region_lon)) deallocate(region_lon) + allocate(region_lat(nlat,nlon),region_lon(nlat,nlon)) + allocate(fv3dx(nxa,nya),fv3dx1(nxa,nya),fv3dy(nxa,nya),fv3dy1(nxa,nya) ) + allocate(fv3ix(nxa,nya),fv3ixp(nxa,nya),fv3jy(nxa,nya),fv3jyp(nxa,nya) ) + allocate( a3dx( ny, nx),a3dx1(ny,nx),a3dy(ny,nx),a3dy1(ny,nx) ) + allocate( a3ix( ny, nx),a3ixp(ny,nx),a3jy(ny,nx),a3jyp(ny,nx) ) + allocate(cangu( nx,ny+1),sangu(nx,ny+1),cangv(nx+1,ny),sangv(nx+1,ny) ) + endif + + +! Broadcast arrays to the other ranks on the same node + !time_beg=MPI_Wtime() + call MPI_Bcast( region_dx,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast( region_dy,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(region_dxi,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(region_dyi,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast( coeffx,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast( coeffy,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + + call MPI_Bcast(region_lat,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(region_lon,nlat*nlon,mpi_rtype,0,nodeComm,ierr) + !walltime=MPI_Wtime()-time_beg + + if (exists) then + ! All ranks need to call init_general_transform eventually + allocate(glat_an(nlon,nlat),glon_an(nlon,nlat)) + !$omp parallel do default(none),private(j,i) & + !$omp shared(nlon,nlat,region_lat,region_lon,glat_an,glon_an) + do j=1,nlat + do i=1,nlon + glat_an(i,j)=region_lat(j,i) + glon_an(i,j)=region_lon(j,i) + enddo + enddo + call init_general_transform(glat_an,glon_an) + deallocate(glat_an,glon_an) + else + if(nodeRank/=0) then + ! nodeRank==0 ranks already called init_general_transform + ! above so just call from remaining ranks + allocate(glat_an(nlon,nlat),glon_an(nlon,nlat)) + !$omp parallel do default(none),private(j,i) & + !$omp shared(nlon,nlat,region_lat,region_lon,glat_an,glon_an) + do j=1,nlat + do i=1,nlon + glat_an(i,j)=region_lat(j,i) + glon_an(i,j)=region_lon(j,i) + enddo + enddo + call init_general_transform(glat_an,glon_an) + deallocate(glat_an,glon_an) + endif + endif + + !time_beg=MPI_Wtime() + call MPI_Bcast(fv3dx ,nxa*nya,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(fv3dx1,nxa*nya,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(fv3dy ,nxa*nya,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(fv3dy1,nxa*nya,mpi_rtype,0,nodeComm,ierr) + + call MPI_Bcast(fv3ix ,nxa*nya,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(fv3ixp,nxa*nya,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(fv3jy ,nxa*nya,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(fv3jyp,nxa*nya,mpi_integer,0,nodeComm,ierr) + + call MPI_Bcast(a3dx ,nx*ny ,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(a3dx1 ,nx*ny ,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(a3dy ,nx*ny ,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(a3dy1 ,nx*ny ,mpi_rtype,0,nodeComm,ierr) + + call MPI_Bcast(a3ix ,nx*ny ,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(a3ixp ,nx*ny ,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(a3jy ,nx*ny ,mpi_integer,0,nodeComm,ierr) + call MPI_Bcast(a3jyp ,nx*ny ,mpi_integer,0,nodeComm,ierr) + + call MPI_Bcast(cangu ,nx*(ny+1),mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(sangu ,nx*(ny+1),mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(cangv ,(nx+1)*ny,mpi_rtype,0,nodeComm,ierr) + call MPI_Bcast(sangv ,(nx+1)*ny,mpi_rtype,0,nodeComm,ierr) + !time_end=MPI_Wtime() + !call MPI_Reduce(walltime+(time_end-time_beg), walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + !if(mype==0) write(6,'("Maximum Walltime to Bcast anl_grid " f15.4)') walltime + + call MPI_Comm_free(nodeComm,ierr) end subroutine generate_anl_grid subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_latt) @@ -667,6 +881,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l allocate(gclon(nxen,nyen)) allocate(gcrlat(nxen,nyen)) allocate(gcrlon(nxen,nyen)) + !$omp parallel do default(none),private(j,i) & + !$omp shared(nyen,nxen,xc,grid_latt,grid_lont,deg2rad,yc,zc) do j=1,nyen do i=1,nxen xc(i,j)=cos(grid_latt(i,j)*deg2rad)*cos(grid_lont(i,j)*deg2rad) @@ -724,6 +940,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l !!!!!! fv3 rotated grid; not equal spacing, non_orthogonal !!!!!! + !$omp parallel do default(none),private(j,jr,i,ir) & + !$omp shared(nyen,nxen,xbh_b,gcrlon,dlon) do j=1,nyen jr=nyen+1-j do i=1,nxen @@ -731,6 +949,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l xbh_b(ir,jr)=gcrlon(i,j)/dlon end do end do + !$omp parallel do default(none),private(j,jr,i,ir) & + !$omp shared(nyen,nxen,ybh_b,gcrlat,dlat) do j=1,nyen jr=nyen+1-j do i=1,nxen @@ -757,6 +977,9 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l ! iteration to find the fv3 grid cell jb1=1 ib1=1 + !$omp parallel do default(none),private(j,i,n,gxa,ib2,jj,yy,gya,jb2,d,kk,k,ds) & + !$omp shared(nye,nxe,xa_a,xbh_b,nxen,ybh_b,ya_a,nyen,fv3ixens,fv3ixpens,fv3jyens,fv3jypens,fv3dyens,fv3dy1ens,fv3dxens,fv3dx1ens,bilinear) & + !$omp firstprivate(jb1,ib1) do j=1,nye do i=1,nxe do n=1,3 @@ -892,6 +1115,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l allocate (a3dxens(nyen,nxen),a3dx1ens(nyen,nxen),a3dyens(nyen,nxen),a3dy1ens(nyen,nxen)) allocate (a3ixens(nyen,nxen),a3ixpens(nyen,nxen),a3jyens(nyen,nxen),a3jypens(nyen,nxen)) + !$omp parallel do default(none),private(j,i,gxa) & + !$omp shared(nyen,nxen,xbh_b,xa_a,nxe,a3ixens,a3dxens,a3dx1ens,a3ixpens) do i=1,nxen do j=1,nyen gxa=xbh_b(i,j) @@ -910,6 +1135,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l end do end do + !$omp parallel do default(none),private(j,i,gya) & + !$omp shared(nyen,nxen,ybh_b,ya_a,nye,a3jyens,a3dyens,a3dy1ens,a3jypens) do i=1,nxen do j=1,nyen gya=ybh_b(i,j) @@ -935,7 +1162,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l allocate (canguens(nxen,nyen+1),sanguens(nxen,nyen+1),cangvens(nxen+1,nyen),sangvens(nxen+1,nyen)) ! 1. compute x,y,z at cell cornor from grid_lon, grid_lat - + !$omp parallel do default(none),private(j,i) & + !$omp shared(nyen,nxen,grid_lat,grid_lon,deg2rad,x,y,z) do j=1,nyen+1 do i=1,nxen+1 x(i,j)=cos(grid_lat(i,j)*deg2rad)*cos(grid_lon(i,j)*deg2rad) @@ -947,6 +1175,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l ! 2 find angles to E-W and N-S for U edges sq180=180._r_kind**2 + !$omp parallel do default(none),private(j,i,rlat,diff,rlon,xr,yr,zr,xu,yu,zu,uval,ewval,nsval) & + !$omp shared(nyen,nxen,grid_lat,grid_lon,sq180,deg2rad,x,y,z,canguens,sanguens) do j=1,nyen+1 do i=1,nxen ! center lat/lon of the edge @@ -975,6 +1205,8 @@ subroutine definecoef_regular_grids(nxen,nyen,grid_lon,grid_lont,grid_lat,grid_l enddo ! 3 find angles to E-W and N-S for V edges + !$omp parallel do default(none),private(j,i,rlat,diff,rlon,xr,yr,zr,xv,yv,zv,vval,ewval,nsval) & + !$omp shared(nyen,nxen,grid_lat,grid_lon,sq180,deg2rad,x,y,z,cangvens,sangvens) do j=1,nyen do i=1,nxen+1 rlat=half*(grid_lat(i,j)+grid_lat(i,j+1)) @@ -1093,6 +1325,8 @@ subroutine fv3uv2earth(u,v,nx,ny,u_out,v_out) real(r_kind),intent( out) :: u_out(nx,ny),v_out(nx,ny) integer(i_kind) i,j + !$omp parallel do default(none), private(j,i), & + !$omp shared(ny,nx,u,sangv,v,sangu,cangu,cangv,u_out,v_out) do j=1,ny do i=1,nx u_out(i,j)=half *( (u(i,j)*sangv(i,j)-v(i,j)*sangu(i,j))/(cangu(i,j)*sangv(i,j)-sangu(i,j)*cangv(i,j)) & @@ -1194,6 +1428,8 @@ subroutine fv3_h_to_ll(b_in,a,nb,mb,na,ma,rev_flg) nbp=nb+1 if(rev_flg) then !!!!!!!!! reverse E-W and N-S + !$omp parallel do default(none), private(j,jr,i,ir), & + !$omp shared(mb,mbp,nb,nbp,b_in,b) do j=1,mb jr=mbp-j do i=1,nb @@ -1206,13 +1442,17 @@ subroutine fv3_h_to_ll(b_in,a,nb,mb,na,ma,rev_flg) endif !!!!!!!!! interpolate to A grid & reverse ij for array a(lat,lon) if(bilinear)then ! bilinear interpolation + !$omp parallel do default(none), private(j,i), & + !$omp shared(ma,na,b,a,fv3dx1,fv3dy1,fv3ix,fv3jy,fv3dy,fv3jyp,fv3dx,fv3ixp) do j=1,ma do i=1,na a(j,i)=fv3dx1(i,j)*(fv3dy1(i,j)*b(fv3ix (i,j),fv3jy(i,j))+fv3dy(i,j)*b(fv3ix (i,j),fv3jyp(i,j))) & +fv3dx (i,j)*(fv3dy1(i,j)*b(fv3ixp(i,j),fv3jy(i,j))+fv3dy(i,j)*b(fv3ixp(i,j),fv3jyp(i,j))) end do end do - else ! inverse-distance weighting average + else ! inverse-distance weighting average + !$omp parallel do default(none), private(j,i), & + !$omp shared(ma,na,b,a,fv3dx,fv3ix,fv3jy,fv3dy,fv3jyp,fv3dx1,fv3ixp,fv3dy1) do j=1,ma do i=1,na a(j,i)=fv3dx(i,j)*b(fv3ix (i,j),fv3jy(i,j))+fv3dy(i,j)*b(fv3ix (i,j),fv3jyp(i,j)) & @@ -1351,7 +1591,7 @@ subroutine fv3_ll_to_h(a,b,nxa,nya,nxb,nyb,rev_flg) do j=1,nxb jr=nxbp-j b(jr+ijr)=a3dy1(i,j)*(a3dx1(i,j)*a(a3jy (i,j),a3ix(i,j))+a3dx(i,j)*a(a3jy (i,j),a3ixp(i,j))) & - +a3dy (i,j)*(a3dx1(i,j)*a(a3jyp(i,j),a3ix(i,j))+a3dx(i,j)*a(a3jyp(i,j),a3ixp(i,j))) + +a3dy (i,j)*(a3dx1(i,j)*a(a3jyp(i,j),a3ix(i,j))+a3dx(i,j)*a(a3jyp(i,j),a3ixp(i,j))) end do end do else @@ -1360,7 +1600,7 @@ subroutine fv3_ll_to_h(a,b,nxa,nya,nxb,nyb,rev_flg) ijr=(i-1)*nxb do j=1,nxb b(j+ijr)=a3dy1(i,j)*(a3dx1(i,j)*a(a3jy (i,j),a3ix(i,j))+a3dx(i,j)*a(a3jy (i,j),a3ixp(i,j))) & - +a3dy (i,j)*(a3dx1(i,j)*a(a3jyp(i,j),a3ix(i,j))+a3dx(i,j)*a(a3jyp(i,j),a3ixp(i,j))) + +a3dy (i,j)*(a3dx1(i,j)*a(a3jyp(i,j),a3ix(i,j))+a3dx(i,j)*a(a3jyp(i,j),a3ixp(i,j))) end do end do endif @@ -1418,7 +1658,8 @@ subroutine rotate2deg(rlon_in,rlat_in,rlon_out,rlat_out,rlon0,rlat0,nx,ny) real(r_kind) x,y,z, xt,yt,zt, xtt,ytt,ztt integer(i_kind) i,j - + !$omp parallel do default(none),private(j,i,x,y,z,xt,yt,zt,xtt,ytt,ztt) & + !$omp shared(ny,nx,rlat_in,rlon_in,deg2rad,rlon0,rlat0,rlat_out,rlon_out,rad2deg) do j=1,ny do i=1,nx ! 1. compute x,y,z from rlon_in, rlat_in @@ -1475,6 +1716,8 @@ subroutine unrotate2deg(rlon_in,rlat_in,rlon_out,rlat_out,rlon0,rlat0,nx,ny) real(r_kind) x,y,z, xt,yt,zt, xtt,ytt,ztt integer(i_kind) i,j + !$omp parallel do default(none),private(j,i,x,y,z,xt,yt,zt,xtt,ytt,ztt) & + !$omp shared(ny,nx,rlat_in,rlon_in,deg2rad,rlat0,rlon0,rlat_out,rlon_out,rad2deg) do j=1,ny do i=1,nx xtt=cos(rlat_out(i,j)*deg2rad)*cos(rlon_out(i,j)*deg2rad) diff --git a/src/gsi/pcgsoi.f90 b/src/gsi/pcgsoi.f90 index 0b808c5c5..266c43b77 100644 --- a/src/gsi/pcgsoi.f90 +++ b/src/gsi/pcgsoi.f90 @@ -159,6 +159,7 @@ subroutine pcgsoi() use berror, only: vprecond use stpjomod, only: stpjo_setup use intradmod, only: setrad + use mpi, only : MPI_Wtime, MPI_Comm_World, MPI_REAL8, MPI_MAX, MPI_SUCCESS implicit none @@ -191,6 +192,8 @@ subroutine pcgsoi() integer(i_kind) :: iortho logical :: print_verbose,ortho,diag_print logical :: lanlerr,read_success + real(kind=8) :: time_beg,time_end,walltime + integer(i_kind) :: ierr ! Step size diagnostic strings data step /'good', 'SMALL'/ @@ -638,7 +641,14 @@ subroutine pcgsoi() ! Write output analysis files if(.not.l4dvar) call prt_guess('analysis') call prt_state_norms(sval(1),'increment') - if (twodvar_regional .or. jiter == miter) call write_all(-1) + if (twodvar_regional .or. jiter == miter) then + time_beg=MPI_Wtime() + call write_all(-1) + time_end=MPI_Wtime() + call MPI_Reduce(time_end-time_beg, walltime, 1, MPI_REAL8, MPI_MAX, 0, MPI_COMM_WORLD, ierr) + if(ierr /= MPI_SUCCESS) print*,'MPI_Reduce ',ierr + if(mype==0) write(6,'("Maximum Walltime for write_all" f15.4)') walltime + endif ! Overwrite guess with increment (4d-var only, for now) if (iwrtinc>0) then From abf57f64c7a87abc49f47fbbc65fa16bd1a6f294 Mon Sep 17 00:00:00 2001 From: TingLei-NOAA <63461756+TingLei-NOAA@users.noreply.github.com> Date: Fri, 25 Jul 2025 12:35:02 -0400 Subject: [PATCH 09/12] fix for problems in ctests using debug mode of GSI release/rrfs1.0.0 (#862) This is a draft PR to show fix for problems when running ctests using debug mode built GSI at rrfs.1.0.0. --- ...enkfapp_compiler_flags_Intel_Fortran.cmake | 3 ++- src/enkf/enkf.f90 | 26 +++++++++++-------- src/enkf/loadbal.f90 | 5 ++++ src/gsi/anisofilter.f90 | 4 +-- src/gsi/combine_radobs.f90 | 12 +++++---- src/gsi/hybrid_ensemble_isotropic.F90 | 1 + src/gsi/read_iasi.f90 | 1 + 7 files changed, 33 insertions(+), 19 deletions(-) diff --git a/src/enkf/cmake/enkfapp_compiler_flags_Intel_Fortran.cmake b/src/enkf/cmake/enkfapp_compiler_flags_Intel_Fortran.cmake index 8ba2887da..6486f0996 100644 --- a/src/enkf/cmake/enkfapp_compiler_flags_Intel_Fortran.cmake +++ b/src/enkf/cmake/enkfapp_compiler_flags_Intel_Fortran.cmake @@ -14,7 +14,8 @@ set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -fp-model strict") # DEBUG FLAGS #################################################################### -set(CMAKE_Fortran_FLAGS_DEBUG "-O0 -fp-model source -debug -ftrapuv -warn all,nointerfaces -check all,noarg_temp_created -fp-stack-check -fstack-protector") +set(CMAKE_Fortran_FLAGS_DEBUG "-O0 -traceback -check all -check bounds -init=arrays,snan -fpe0 -fp-model source -debug -ftrapuv -warn all,nointerfaces -check all,noarg_temp_created -fp-stack-check -fstack-protector") + #################################################################### # LINK FLAGS diff --git a/src/enkf/enkf.f90 b/src/enkf/enkf.f90 index 479f60c01..ac1566bf5 100644 --- a/src/enkf/enkf.f90 +++ b/src/enkf/enkf.f90 @@ -191,6 +191,7 @@ subroutine enkf_update() integer(i_kind) nanal,nn,nnn,nobm,nsame,nn1,nn2,oz_ind,nlev,dbz_ind real(r_single),dimension(nlevs_pres):: taperv logical lastiter, kdgrid, kdobs +real(r_single) :: meanval ! allocate temporary arrays. allocate(anal_obchunk(nanals,nobs_max)) @@ -764,22 +765,25 @@ subroutine enkf_update() ! make sure posterior perturbations still have zero mean. ! (roundoff errors can accumulate) + meanval=zero !use firstprivate to avoid "false sharing' if (lastiter .and. .not. lupd_obspace_serial) then - !$omp parallel do schedule(dynamic) private(npt,nb,i) - do npt=1,npts_max - do nb=1,nbackgrounds - do i=1,ncdim - anal_chunk(1:nanals,npt,i,nb) = anal_chunk(1:nanals,npt,i,nb)-& - sum(anal_chunk(1:nanals,npt,i,nb),1)*r_nanals - end do - end do - enddo + !$omp parallel do schedule(dynamic) private(npt, nb, i) firstprivate(meanval) + do npt = 1, npts_max + do nb = 1, nbackgrounds + do i = 1, ncdim + meanval = sum(anal_chunk(1:nanals, npt, i, nb), 1) * r_nanals + anal_chunk(1:nanals, npt, i, nb) = anal_chunk(1:nanals, npt, i, nb) - meanval + end do + end do + end do !$omp end parallel do endif - !$omp parallel do schedule(dynamic) private(nob) + meanval=zero !use firstprivate to avoid "false sharing" + !$omp parallel do schedule(dynamic) private(nob) firstprivate(meanval) do nob=1,nobs_max + meanval=sum(anal_obchunk(1:nanals,nob),1) anal_obchunk(1:nanals,nob) = anal_obchunk(1:nanals,nob)-& - sum(anal_obchunk(1:nanals,nob),1)*r_nanals + meanval*r_nanals enddo !$omp end parallel do diff --git a/src/enkf/loadbal.f90 b/src/enkf/loadbal.f90 index 3292c99ee..4378f4c53 100644 --- a/src/enkf/loadbal.f90 +++ b/src/enkf/loadbal.f90 @@ -167,6 +167,9 @@ subroutine load_balance() ! assume work load proportional to number of 'nearby' obs call estimate_work_enkf1(numobs) ! fill numobs array with number of obs per horiz point ! distribute the results of estimate_work to all processors. +!clt call mpi_barrier(mpi_comm_world,ierr) !added, for debug mode, on wcoss2 ,otherwise, +!clt !" MPICH suspects a hang due to rendezvous message resource exhaustion." +!clt commented out now for the noticeable degraded performance call mpi_allreduce(mpi_in_place,numobs,npts,mpi_integer,mpi_sum,mpi_comm_world,ierr) if (letkf_flag .and. nobsl_max > 0) then where(numobs > nobsl_max) numobs = nobsl_max @@ -298,6 +301,7 @@ subroutine load_balance() end if ! for serial enkf, create observation priors to be updated on each processor. allocate(anal_obchunk_prior(nanals,nobs_max)) + anal_obchunk_prior=zero do nob1=1,numobsperproc(nproc+1) nob2 = indxproc_obs(nproc+1,nob1) anal_obchunk_prior(1:nanals,nob1) = anal_ob(1:nanals,nob2) @@ -357,6 +361,7 @@ subroutine scatter_chunks allocate(rcounts(0:numproc-1)) ! allocate array to hold pieces of state vector on each proc. allocate(anal_chunk(nanals,npts_max,ncdim,nbackgrounds)) +anal_chunk=zero if (nproc == 0) print *,'anal_chunk size = ',size(anal_chunk,kind=8) ! only IO tasks send any data. diff --git a/src/gsi/anisofilter.f90 b/src/gsi/anisofilter.f90 index c05c764a0..2977fab58 100755 --- a/src/gsi/anisofilter.f90 +++ b/src/gsi/anisofilter.f90 @@ -5415,16 +5415,16 @@ subroutine get2berr_reg_subdomain_option(mype) ! ! CONVERT bckgvar4 FROM FILTER GRID TO ANALYSIS GRID BEFORE WRITING OUT ! + if(mype==0) then bckgvar8f=bckgvar4f call fgrid2agrid(pf2aP1,bckgvar8f,bckgvar8a) bckgvar4=bckgvar8a - if(mype==0) then ivar=jdvar(k) chvarname=fvarname(ivar) open (94,file='bckgvar.dat_'//trim(chvarname),form='unformatted') write(94) bckgvar4 close(94) - end if + end if enddo deallocate(bckgvar4t,bckgvar4f,bckgvar4,bckgvar8f,bckgvar8a,bckgvar0f) diff --git a/src/gsi/combine_radobs.f90 b/src/gsi/combine_radobs.f90 index 7692bdef3..ab1f814f2 100644 --- a/src/gsi/combine_radobs.f90 +++ b/src/gsi/combine_radobs.f90 @@ -25,11 +25,10 @@ subroutine combine_radobs(mype_sub,mype_root,& ! data_all - observation data array ! data_crit- array containing observation "best scores" ! nread - task specific number of obesrvations read from data file -! ndata - task specific number of observations keep for assimilation ! ! output argument list: ! nread - total number of observations read from data file (mype_root) -! ndata - total number of observations keep for assimilation (mype_root) +! ndata - total number of observation profiles kept for assimilation in the thinning box (mype_root) ! data_all - merged observation data array (mype_root) ! data_crit- merged array containing observation "best scores" (mype_root) ! @@ -50,7 +49,8 @@ subroutine combine_radobs(mype_sub,mype_root,& integer(i_kind) ,intent(in ) :: npe_sub,itxmax integer(i_kind) ,intent(in ) :: nele integer(i_kind) ,intent(in ) :: mpi_comm_sub - integer(i_kind) ,intent(inout) :: nread,ndata + integer(i_kind) ,intent(inout) :: nread + integer(i_kind) ,intent( out) :: ndata integer(i_kind),dimension(itxmax) ,intent(in ) :: nrec real(r_kind),dimension(itxmax) ,intent(inout) :: data_crit real(r_kind),dimension(nele,itxmax),intent(inout) :: data_all @@ -74,7 +74,7 @@ subroutine combine_radobs(mype_sub,mype_root,& nread=0 if (mype_sub==mype_root) nread = ncounts1 - if (ncounts1 == 0)return + if (ncounts1 <= 0)return ! Allocate arrays to hold data @@ -83,7 +83,7 @@ subroutine combine_radobs(mype_sub,mype_root,& ! is only needed on task mype_root call mpi_allreduce(data_crit,data_crit_min,itxmax,mpi_rtype,mpi_min,mpi_comm_sub,ierror) - allocate(nloc(min(ncounts1,itxmax)),icrit(min(ncounts1,itxmax))) + allocate(nloc(itxmax),icrit(itxmax)) icrit=1e9 ndata=0 ndata1=0 @@ -116,6 +116,7 @@ subroutine combine_radobs(mype_sub,mype_root,& end if deallocate(icrit) allocate(data_all_in(nele,ndata)) + data_all_in=zero !$omp parallel do private(kk,k,l) do kk=1,ndata k=nloc(kk) @@ -131,6 +132,7 @@ subroutine combine_radobs(mype_sub,mype_root,& end do deallocate(nloc) + ! get all data on process mype_root ! data_all(:,:) = zero diff --git a/src/gsi/hybrid_ensemble_isotropic.F90 b/src/gsi/hybrid_ensemble_isotropic.F90 index d1c8a7211..b231a06d4 100644 --- a/src/gsi/hybrid_ensemble_isotropic.F90 +++ b/src/gsi/hybrid_ensemble_isotropic.F90 @@ -1085,6 +1085,7 @@ subroutine normal_new_factorization_rf_x endif ! File didnt exist so we computed the data. Now save it for subsequent runs. if(mype==0) then + inunit=2000+mype open(inunit,file=trim(input),form='unformatted',action='write') write(inunit) xnorm_new close(inunit) diff --git a/src/gsi/read_iasi.f90 b/src/gsi/read_iasi.f90 index edd7a9b50..d2d221efd 100644 --- a/src/gsi/read_iasi.f90 +++ b/src/gsi/read_iasi.f90 @@ -434,6 +434,7 @@ subroutine read_iasi(mype,val_iasi,ithin,isfcalc,rmesh,jsatid,gstime,& ! The number of channels in obtained from the satinfo file being used. nele=nreal+satinfo_nchan allocate(data_all(nele,itxmax),nrec(itxmax)) + data_all=zero allocate(temperature(1)) ! dependent on # of channels in the bufr file allocate(allchan(2,1)) ! actual values set after ireadsb allocate(bufr_chan_test(1))! actual values set after ireadsb From ed2b482f034e67d2b0136e23899cec49aae1f166 Mon Sep 17 00:00:00 2001 From: RuifangLi7 Date: Mon, 20 Oct 2025 07:02:07 -0600 Subject: [PATCH 10/12] Update bufr_d to bufr_4 for bufr 12.1.0 version. (#941) --- src/gsi/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gsi/CMakeLists.txt b/src/gsi/CMakeLists.txt index d7a02ed99..2a281efbd 100644 --- a/src/gsi/CMakeLists.txt +++ b/src/gsi/CMakeLists.txt @@ -156,7 +156,7 @@ target_link_libraries(gsi_fortran_obj PUBLIC nemsio::nemsio) target_link_libraries(gsi_fortran_obj PUBLIC ncio::ncio) target_link_libraries(gsi_fortran_obj PUBLIC w3emc::w3emc_d) target_link_libraries(gsi_fortran_obj PUBLIC sp::sp_d) -target_link_libraries(gsi_fortran_obj PUBLIC bufr::bufr_d) +target_link_libraries(gsi_fortran_obj PUBLIC bufr::bufr_4) target_link_libraries(gsi_fortran_obj PUBLIC crtm::crtm) target_link_libraries(gsi_fortran_obj PUBLIC ZLIB::ZLIB) if(GSI_MODE MATCHES "Regional") From fdfa5209530486d1694e9dc442ff404c09597458 Mon Sep 17 00:00:00 2001 From: MatthewPyle-NOAA <48285220+MatthewPyle-NOAA@users.noreply.github.com> Date: Tue, 9 Dec 2025 14:28:53 -0500 Subject: [PATCH 11/12] [release/rrfs.v1.0.0] Adds changes needed for BUFR/12.2 (#963) This makes small changes needed for this branch to work with BUFR/12.2, which is the version being used for the RRFS implementation. --- src/gsi/read_pblh.f90 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gsi/read_pblh.f90 b/src/gsi/read_pblh.f90 index 1ff2cd2c2..44d264ec1 100644 --- a/src/gsi/read_pblh.f90 +++ b/src/gsi/read_pblh.f90 @@ -374,13 +374,13 @@ subroutine read_pblh(nread,ndata,nodata,infile,obstype,lunout,twindin,& ! OBLVL == SRC FHR ! == SRC FHR POB PMO QOB TOB ZOB UOB VOB ... ! {P,Q,T,Z,W}EVN ! cnem='SRC FHR POB PMO QOB TOB ZOB UOB VOB' -! call ufbin3(lunin,olv,MXNM,MXRP,MXRP, nlevp,nlevo,cnem) +! call ufbevn(lunin,olv,MXNM,MXRP,MXRP, nlevp,nlevo,cnem) ! CEVN == CAPE CINH LI PBL TROP PWO ! cnem='CAPE CINH LI PBL TROP PWO' cnem='PBL' ! for Caterina's files (ruc_raobs) clv=bmiss - call ufbin3(lunin,clv,MXNM,MXRP,MXRP, nlevp,nlevc,cnem) + call ufbevn(lunin,clv,MXNM,MXRP,MXRP, nlevp,nlevc,cnem) ! pblhob=clv(4,1,2) pblhob=clv(1,1,2) pblbak=clv(1,1,1) ! model PBL; from Caterina's files @@ -422,7 +422,7 @@ subroutine read_pblh(nread,ndata,nodata,infile,obstype,lunout,twindin,& ! SEVN == HOVI MXTM MITM TDO TOCC MXGS THI TCH CDBZ ! cnem='HOVI MXTM MITM TDO TOCC MXGS THI TCH CDBZ' -! call ufbin3(lunin,slv,MXNM,MXRP,MXRP, nlevp,nlevs,cnem) +! call ufbevn(lunin,slv,MXNM,MXRP,MXRP, nlevp,nlevs,cnem) !--Outputs From bcc8117d886820d38bf11b7597103ccf48e3fead Mon Sep 17 00:00:00 2001 From: ShunLiu-NOAA Date: Wed, 10 Dec 2025 08:38:19 -0500 Subject: [PATCH 12/12] add sat ID of goes19 to read-in list (#967) Add the GOES-19 satellite ID to the read-in list so that RRFSv1 can properly use abi_g19 data. --- src/gsi/read_abi.f90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gsi/read_abi.f90 b/src/gsi/read_abi.f90 index eaa6b1675..103b8ce54 100644 --- a/src/gsi/read_abi.f90 +++ b/src/gsi/read_abi.f90 @@ -240,6 +240,8 @@ subroutine read_abi(mype,val_abi,ithin,rmesh,jsatid,& kidsat = 271 elseif (jsatid == 'g18') then kidsat = 272 + elseif (jsatid == 'g19') then + kidsat = 273 else write(6,*) 'READ_ABI: Unrecognized value for jsatid '//jsatid//': RETURNING' return