diff --git a/NEWS.md b/NEWS.md index 1bd1ae6a..82d5842c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,12 @@ main `Utils.isequispaced` is now more efficient: it fails fast and does not allocate as much. More redundant allocations due to `Utils.isequispaced` were fixed. +### Features + +- Reduced allocations in regridding. New method `read!`. +Existing `read` now returns a copy of the data when returning from the cache. +PR [#119](https://github.com/CliMA/ClimaUtilities.jl/pull/119) + v0.1.15 ------- diff --git a/docs/src/filereaders.md b/docs/src/filereaders.md index 1b4d689c..7134b795 100644 --- a/docs/src/filereaders.md +++ b/docs/src/filereaders.md @@ -23,9 +23,10 @@ The only file reader currently implemented is the `NCFileReader`, used to read NetCDF files. Each `NCFileReader` is associated to one particular file and variable (but multiple `NCFileReader`s can share the same file). -Once created, `NCFileReader` is accessed with the `read(file_reader, date)` +Once created, `NCFileReader` is accessed with the `read!(dest, file_reader, date)` -function, which returns the `Array` associated to given `date` (if available). +function, which writes the `Array` for the given `date` (if available) into `dest`. -The `date` can be omitted if the data is static. +The `date` can be omitted if the data is static. Because `dest` is a +preallocated array, it can be reused across reads without reallocating. `NCFileReader`s implement two additional features: (1) optional preprocessing, and (2) cache reads. `NCFileReader`s can be created with a `preprocessing_func` @@ -78,6 +79,7 @@ close(v_var) ```@docs ClimaUtilities.FileReaders.NCFileReader ClimaUtilities.FileReaders.read +ClimaUtilities.FileReaders.read! 
ClimaUtilities.FileReaders.available_dates ClimaUtilities.FileReaders.close_all_ncfiles Base.close diff --git a/ext/DataHandlingExt.jl b/ext/DataHandlingExt.jl index fbe4c5ef..292fe992 100644 --- a/ext/DataHandlingExt.jl +++ b/ext/DataHandlingExt.jl @@ -8,7 +8,7 @@ import ClimaCore: ClimaComms import ClimaUtilities.DataStructures import ClimaUtilities.Regridders -import ClimaUtilities.FileReaders: AbstractFileReader, NCFileReader, read +import ClimaUtilities.FileReaders: AbstractFileReader, NCFileReader, read, read! import ClimaUtilities.Regridders: AbstractRegridder, regrid import ClimaUtilities.Utils: isequispaced, period_to_seconds_float @@ -57,6 +57,7 @@ struct DataHandler{ CACHE <: DataStructures.LRUCache{Dates.DateTime, ClimaCore.Fields.Field}, FUNC <: Function, NAMES <: AbstractArray{<:AbstractString}, + PR <: AbstractDict{<:AbstractString, <:AbstractArray}, } """Dictionary of variable names and objects responsible for getting the input data from disk to memory""" file_readers::FR @@ -87,6 +88,9 @@ struct DataHandler{ """Names of the datasets in the NetCDF that have to be read and processed""" varnames::NAMES + + """Preallocated memory for storing read dataset""" + preallocated_read_data::PR end """ @@ -105,6 +109,8 @@ In the latter case, the entries of `file_paths` and `varnames` are expected to m The DataHandler maintains an LRU cache of Fields that were previously computed. +Creating this object results in the file being accessed (to preallocate some memory). + Positional arguments ===================== @@ -264,6 +270,13 @@ function DataHandling.DataHandler( available_times = period_to_seconds_float.(available_dates .- start_date) dimensions = first(values(file_readers)).dimensions + # Preallocate space for each variable to be read + one_date = isempty(available_dates) ? () : (first(available_dates),) + preallocated_read_data = Dict( + varname => read(file_readers[varname], one_date...) 
for + varname in varnames + ) + return DataHandler( file_readers, regridder, @@ -275,6 +288,7 @@ function DataHandling.DataHandler( _cached_regridded_fields, compose_function, varnames, + preallocated_read_data, ) end @@ -460,11 +474,19 @@ function DataHandling.regridded_snapshot( regrid_args = (date,) end elseif regridder_type == :InterpolationsRegridder + # Read input data from each file, maintaining order, and apply composing function # In the case of a single input variable, it will remain unchanged + for varname in varnames + read!( + data_handler.preallocated_read_data[varname], + data_handler.file_readers[varname], + date, + ) + end data_composed = compose_function( ( - read(data_handler.file_readers[varname], date) for + data_handler.preallocated_read_data[varname] for varname in varnames )..., ) diff --git a/ext/NCFileReaderExt.jl b/ext/NCFileReaderExt.jl index 1c565a0d..0baffa83 100644 --- a/ext/NCFileReaderExt.jl +++ b/ext/NCFileReaderExt.jl @@ -174,6 +174,14 @@ end Read and preprocess the data at the given `date`. """ function FileReaders.read(file_reader::NCFileReader, date::Dates.DateTime) + # For cache hits, return a copy to give away ownership of the data (if we were to just + # return _cached_reads[date], modifying the return value would modify the private state + # of the file reader) + + if haskey(file_reader._cached_reads, date) + return copy(file_reader._cached_reads[date]) + end + # DateTime(0) is the sentinel value for static datasets if date == Dates.DateTime(0) return get!(file_reader._cached_reads, date) do @@ -188,16 +196,14 @@ function FileReaders.read(file_reader::NCFileReader, date::Dates.DateTime) error("Problem with date $date in $(file_reader.file_path)") index = index[] - return get!(file_reader._cached_reads, date) do - var = file_reader.dataset[file_reader.varname] - slicer = [ - i == file_reader.time_index ? 
index : Colon() for - i in 1:length(NCDatasets.dimnames(var)) - ] - return file_reader.preprocess_func.( - file_reader.dataset[file_reader.varname][slicer...] - ) - end + var = file_reader.dataset[file_reader.varname] + slicer = [ + i == file_reader.time_index ? index : Colon() for + i in 1:length(NCDatasets.dimnames(var)) + ] + return file_reader.preprocess_func.( + file_reader.dataset[file_reader.varname][slicer...] + ) end """ @@ -226,4 +232,28 @@ function FileReaders.read(file_reader::NCFileReader) end end +""" + read!(dest, file_reader::NCFileReader) + +Read and preprocess data (for static datasets), saving the output to `dest`. +""" +function FileReaders.read!(dest, file_reader::NCFileReader) + dest .= FileReaders.read(file_reader) + return nothing +end + +""" + read!(dest, file_reader::NCFileReader, date::Dates.DateTime) + +Read and preprocess the data at the given `date`, saving the output to `dest`. +""" +function FileReaders.read!( + dest, + file_reader::NCFileReader, + date::Dates.DateTime, +) + dest .= FileReaders.read(file_reader, date) + return nothing +end + end diff --git a/src/FileReaders.jl b/src/FileReaders.jl index 5299abac..28e6aeaf 100644 --- a/src/FileReaders.jl +++ b/src/FileReaders.jl @@ -17,6 +17,8 @@ function NCFileReader end function read end +function read! end + function available_dates end function close_all_ncfiles end diff --git a/test/file_readers.jl b/test/file_readers.jl index 024a3292..310d7e29 100644 --- a/test/file_readers.jl +++ b/test/file_readers.jl @@ -37,6 +37,12 @@ using NCDatasets @test FileReaders.read(ncreader_u, DateTime(2021, 01, 01, 01)) == nc["u10n"][:, :, 2] + # Test read! + dest = copy(nc["u10n"][:, :, 2]) + fill!(dest, 0) + FileReaders.read!(dest, ncreader_u, DateTime(2021, 01, 01, 01)) + @test dest == nc["u10n"][:, :, 2] + # Test that we need to close all the variables to close the file open_ncfiles = Base.get_extension( @@ -73,6 +79,12 @@ end @test FileReaders.read(ncreader) == nc["u10n"][:, :] + # Test read! 
+ dest = copy(nc["u10n"][:, :]) + fill!(dest, 0) + FileReaders.read!(dest, ncreader) + @test dest == nc["u10n"][:, :] + @test isempty(FileReaders.available_dates(ncreader)) FileReaders.close_all_ncfiles()