diff --git a/src/enkf/cmake/enkfapp_compiler_flags_Intel_Fortran.cmake b/src/enkf/cmake/enkfapp_compiler_flags_Intel_Fortran.cmake index 8ba2887da..6486f0996 100644 --- a/src/enkf/cmake/enkfapp_compiler_flags_Intel_Fortran.cmake +++ b/src/enkf/cmake/enkfapp_compiler_flags_Intel_Fortran.cmake @@ -14,7 +14,8 @@ set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -fp-model strict") # DEBUG FLAGS #################################################################### -set(CMAKE_Fortran_FLAGS_DEBUG "-O0 -fp-model source -debug -ftrapuv -warn all,nointerfaces -check all,noarg_temp_created -fp-stack-check -fstack-protector") +set(CMAKE_Fortran_FLAGS_DEBUG "-O0 -traceback -check all -check bounds -init=arrays,snan -fpe0 -fp-model source -debug -ftrapuv -warn all,nointerfaces -check all,noarg_temp_created -fp-stack-check -fstack-protector") + #################################################################### # LINK FLAGS diff --git a/src/enkf/enkf.f90 b/src/enkf/enkf.f90 index 479f60c01..ac1566bf5 100644 --- a/src/enkf/enkf.f90 +++ b/src/enkf/enkf.f90 @@ -191,6 +191,7 @@ subroutine enkf_update() integer(i_kind) nanal,nn,nnn,nobm,nsame,nn1,nn2,oz_ind,nlev,dbz_ind real(r_single),dimension(nlevs_pres):: taperv logical lastiter, kdgrid, kdobs +real(r_single) :: meanval ! allocate temporary arrays. allocate(anal_obchunk(nanals,nobs_max)) @@ -764,22 +765,25 @@ subroutine enkf_update() ! make sure posterior perturbations still have zero mean. ! (roundoff errors can accumulate) + meanval=zero !use firstprivate to avoid "false sharing' if (lastiter .and. .not. lupd_obspace_serial) then - !$omp parallel do schedule(dynamic) private(npt,nb,i) - do npt=1,npts_max - do nb=1,nbackgrounds - do i=1,ncdim - anal_chunk(1:nanals,npt,i,nb) = anal_chunk(1:nanals,npt,i,nb)-& - sum(anal_chunk(1:nanals,npt,i,nb),1)*r_nanals - end do - end do - enddo + !$omp parallel do schedule(dynamic) private(npt, nb, i) firstprivate(meanval) + do npt = 1, npts_max + do nb = 1, nbackgrounds + do i = 1, ncdim + meanval = sum(anal_chunk(1:nanals, npt, i, nb), 1) * r_nanals + anal_chunk(1:nanals, npt, i, nb) = anal_chunk(1:nanals, npt, i, nb) - meanval + end do + end do + end do !$omp end parallel do endif - !$omp parallel do schedule(dynamic) private(nob) + meanval=zero !use firstprivate to avoid "false sharing" + !$omp parallel do schedule(dynamic) private(nob) firstprivate(meanval) do nob=1,nobs_max + meanval=sum(anal_obchunk(1:nanals,nob),1) anal_obchunk(1:nanals,nob) = anal_obchunk(1:nanals,nob)-& - sum(anal_obchunk(1:nanals,nob),1)*r_nanals + meanval*r_nanals enddo !$omp end parallel do diff --git a/src/enkf/loadbal.f90 b/src/enkf/loadbal.f90 index 3292c99ee..4378f4c53 100644 --- a/src/enkf/loadbal.f90 +++ b/src/enkf/loadbal.f90 @@ -167,6 +167,9 @@ subroutine load_balance() ! assume work load proportional to number of 'nearby' obs call estimate_work_enkf1(numobs) ! fill numobs array with number of obs per horiz point ! distribute the results of estimate_work to all processors. +!clt call mpi_barrier(mpi_comm_world,ierr) !added, for debug mode, on wcoss2 ,otherwise, +!clt !" MPICH suspects a hang due to rendezvous message resource exhaustion." +!clt commented out now for the noticeable degraded performance call mpi_allreduce(mpi_in_place,numobs,npts,mpi_integer,mpi_sum,mpi_comm_world,ierr) if (letkf_flag .and. nobsl_max > 0) then where(numobs > nobsl_max) numobs = nobsl_max @@ -298,6 +301,7 @@ subroutine load_balance() end if ! for serial enkf, create observation priors to be updated on each processor. allocate(anal_obchunk_prior(nanals,nobs_max)) + anal_obchunk_prior=zero do nob1=1,numobsperproc(nproc+1) nob2 = indxproc_obs(nproc+1,nob1) anal_obchunk_prior(1:nanals,nob1) = anal_ob(1:nanals,nob2) @@ -357,6 +361,7 @@ subroutine scatter_chunks allocate(rcounts(0:numproc-1)) ! allocate array to hold pieces of state vector on each proc. allocate(anal_chunk(nanals,npts_max,ncdim,nbackgrounds)) +anal_chunk=zero if (nproc == 0) print *,'anal_chunk size = ',size(anal_chunk,kind=8) ! only IO tasks send any data. diff --git a/src/gsi/anisofilter.f90 b/src/gsi/anisofilter.f90 index c05c764a0..2977fab58 100755 --- a/src/gsi/anisofilter.f90 +++ b/src/gsi/anisofilter.f90 @@ -5415,16 +5415,16 @@ subroutine get2berr_reg_subdomain_option(mype) ! ! CONVERT bckgvar4 FROM FILTER GRID TO ANALYSIS GRID BEFORE WRITING OUT ! + if(mype==0) then bckgvar8f=bckgvar4f call fgrid2agrid(pf2aP1,bckgvar8f,bckgvar8a) bckgvar4=bckgvar8a - if(mype==0) then ivar=jdvar(k) chvarname=fvarname(ivar) open (94,file='bckgvar.dat_'//trim(chvarname),form='unformatted') write(94) bckgvar4 close(94) - end if + end if enddo deallocate(bckgvar4t,bckgvar4f,bckgvar4,bckgvar8f,bckgvar8a,bckgvar0f) diff --git a/src/gsi/combine_radobs.f90 b/src/gsi/combine_radobs.f90 index 7692bdef3..ab1f814f2 100644 --- a/src/gsi/combine_radobs.f90 +++ b/src/gsi/combine_radobs.f90 @@ -25,11 +25,10 @@ subroutine combine_radobs(mype_sub,mype_root,& ! data_all - observation data array ! data_crit- array containing observation "best scores" ! nread - task specific number of obesrvations read from data file -! ndata - task specific number of observations keep for assimilation ! ! output argument list: ! nread - total number of observations read from data file (mype_root) -! ndata - total number of observations keep for assimilation (mype_root) +! ndata - total number of observation profiles kept for assimilation in the thinning box (mype_root) ! data_all - merged observation data array (mype_root) ! data_crit- merged array containing observation "best scores" (mype_root) ! @@ -50,7 +49,8 @@ subroutine combine_radobs(mype_sub,mype_root,& integer(i_kind) ,intent(in ) :: npe_sub,itxmax integer(i_kind) ,intent(in ) :: nele integer(i_kind) ,intent(in ) :: mpi_comm_sub - integer(i_kind) ,intent(inout) :: nread,ndata + integer(i_kind) ,intent(inout) :: nread + integer(i_kind) ,intent( out) :: ndata integer(i_kind),dimension(itxmax) ,intent(in ) :: nrec real(r_kind),dimension(itxmax) ,intent(inout) :: data_crit real(r_kind),dimension(nele,itxmax),intent(inout) :: data_all @@ -74,7 +74,7 @@ subroutine combine_radobs(mype_sub,mype_root,& nread=0 if (mype_sub==mype_root) nread = ncounts1 - if (ncounts1 == 0)return + if (ncounts1 <= 0)return ! Allocate arrays to hold data @@ -83,7 +83,7 @@ subroutine combine_radobs(mype_sub,mype_root,& ! is only needed on task mype_root call mpi_allreduce(data_crit,data_crit_min,itxmax,mpi_rtype,mpi_min,mpi_comm_sub,ierror) - allocate(nloc(min(ncounts1,itxmax)),icrit(min(ncounts1,itxmax))) + allocate(nloc(itxmax),icrit(itxmax)) icrit=1e9 ndata=0 ndata1=0 @@ -116,6 +116,7 @@ subroutine combine_radobs(mype_sub,mype_root,& end if deallocate(icrit) allocate(data_all_in(nele,ndata)) + data_all_in=zero !$omp parallel do private(kk,k,l) do kk=1,ndata k=nloc(kk) @@ -131,6 +132,7 @@ subroutine combine_radobs(mype_sub,mype_root,& end do deallocate(nloc) + ! get all data on process mype_root ! data_all(:,:) = zero diff --git a/src/gsi/hybrid_ensemble_isotropic.F90 b/src/gsi/hybrid_ensemble_isotropic.F90 index d1c8a7211..b231a06d4 100644 --- a/src/gsi/hybrid_ensemble_isotropic.F90 +++ b/src/gsi/hybrid_ensemble_isotropic.F90 @@ -1085,6 +1085,7 @@ subroutine normal_new_factorization_rf_x endif ! File didnt exist so we computed the data. Now save it for subsequent runs. if(mype==0) then + inunit=2000+mype open(inunit,file=trim(input),form='unformatted',action='write') write(inunit) xnorm_new close(inunit) diff --git a/src/gsi/read_iasi.f90 b/src/gsi/read_iasi.f90 index edd7a9b50..d2d221efd 100644 --- a/src/gsi/read_iasi.f90 +++ b/src/gsi/read_iasi.f90 @@ -434,6 +434,7 @@ subroutine read_iasi(mype,val_iasi,ithin,isfcalc,rmesh,jsatid,gstime,& ! The number of channels in obtained from the satinfo file being used. nele=nreal+satinfo_nchan allocate(data_all(nele,itxmax),nrec(itxmax)) + data_all=zero allocate(temperature(1)) ! dependent on # of channels in the bufr file allocate(allchan(2,1)) ! actual values set after ireadsb allocate(bufr_chan_test(1))! actual values set after ireadsb