Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ main
`Utils.isequispaced` is now more efficient: it fails fast and does not allocate
as much. More redundant allocations due to `Utils.isequispaced` were fixed.

### Features

- Reduced allocations in regridding. New method `read!`.
The existing `read` now returns a copy of the data when it is served from the cache.
PR [#119](https://github.com/CliMA/ClimaUtilities.jl/pull/119)

v0.1.15
-------

Expand Down
6 changes: 4 additions & 2 deletions docs/src/filereaders.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ The only file reader currently implemented is the `NCFileReader`, used to read
NetCDF files. Each `NCFileReader` is associated to one particular file and
variable (but multiple `NCFileReader`s can share the same file).

Once created, `NCFileReader` is accessed with the `read(file_reader, date)`
Once created, `NCFileReader` is accessed with the `read!(dest, file_reader, date)`
function, which fills the preallocated array `dest` with the data associated with the given `date` (if available).
The `date` can be omitted if the data is static.
The `date` can be omitted if the data is static. The data is stored in a
preallocated array so it can be accessed multiple times without reallocating.

`NCFileReader`s implement two additional features: (1) optional preprocessing,
and (2) cache reads. `NCFileReader`s can be created with a `preprocessing_func`
Expand Down Expand Up @@ -78,6 +79,7 @@ close(v_var)
```@docs
ClimaUtilities.FileReaders.NCFileReader
ClimaUtilities.FileReaders.read
ClimaUtilities.FileReaders.read!
ClimaUtilities.FileReaders.available_dates
ClimaUtilities.FileReaders.close_all_ncfiles
Base.close
Expand Down
26 changes: 24 additions & 2 deletions ext/DataHandlingExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import ClimaCore: ClimaComms

import ClimaUtilities.DataStructures
import ClimaUtilities.Regridders
import ClimaUtilities.FileReaders: AbstractFileReader, NCFileReader, read
import ClimaUtilities.FileReaders: AbstractFileReader, NCFileReader, read, read!
import ClimaUtilities.Regridders: AbstractRegridder, regrid

import ClimaUtilities.Utils: isequispaced, period_to_seconds_float
Expand Down Expand Up @@ -57,6 +57,7 @@ struct DataHandler{
CACHE <: DataStructures.LRUCache{Dates.DateTime, ClimaCore.Fields.Field},
FUNC <: Function,
NAMES <: AbstractArray{<:AbstractString},
PR <: AbstractDict{<:AbstractString, <:AbstractArray},
}
"""Dictionary of variable names and objects responsible for getting the input data from disk to memory"""
file_readers::FR
Expand Down Expand Up @@ -87,6 +88,9 @@ struct DataHandler{

"""Names of the datasets in the NetCDF that have to be read and processed"""
varnames::NAMES

"""Preallocated memory for storing read dataset"""
preallocated_read_data::PR
end

"""
Expand All @@ -105,6 +109,8 @@ In the latter case, the entries of `file_paths` and `varnames` are expected to m

The DataHandler maintains an LRU cache of Fields that were previously computed.

Creating this object results in the file being accessed (to preallocate some memory).

Positional arguments
=====================

Expand Down Expand Up @@ -264,6 +270,13 @@ function DataHandling.DataHandler(
available_times = period_to_seconds_float.(available_dates .- start_date)
dimensions = first(values(file_readers)).dimensions

# Preallocate space for each variable to be read
one_date = isempty(available_dates) ? () : (first(available_dates),)
preallocated_read_data = Dict(
varname => read(file_readers[varname], one_date...) for
varname in varnames
)

return DataHandler(
file_readers,
regridder,
Expand All @@ -275,6 +288,7 @@ function DataHandling.DataHandler(
_cached_regridded_fields,
compose_function,
varnames,
preallocated_read_data,
)
end

Expand Down Expand Up @@ -460,11 +474,19 @@ function DataHandling.regridded_snapshot(
regrid_args = (date,)
end
elseif regridder_type == :InterpolationsRegridder

# Read input data from each file, maintaining order, and apply composing function
# In the case of a single input variable, it will remain unchanged
for varname in varnames
read!(
data_handler.preallocated_read_data[varname],
data_handler.file_readers[varname],
date,
)
end
data_composed = compose_function(
(
read(data_handler.file_readers[varname], date) for
data_handler.preallocated_read_data[varname] for
varname in varnames
)...,
)
Expand Down
50 changes: 40 additions & 10 deletions ext/NCFileReaderExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,14 @@ end
Read and preprocess the data at the given `date`.
"""
function FileReaders.read(file_reader::NCFileReader, date::Dates.DateTime)
# For cache hits, return a copy to give away ownership of the data (if we were to just
# return _cached_reads[date], modifying the return value would modify the private state
# of the file reader)

if haskey(file_reader._cached_reads, date)
return copy(file_reader._cached_reads[date])
end

# DateTime(0) is the sentinel value for static datasets
if date == Dates.DateTime(0)
return get!(file_reader._cached_reads, date) do
Expand All @@ -188,16 +196,14 @@ function FileReaders.read(file_reader::NCFileReader, date::Dates.DateTime)
error("Problem with date $date in $(file_reader.file_path)")
index = index[]

return get!(file_reader._cached_reads, date) do
var = file_reader.dataset[file_reader.varname]
slicer = [
i == file_reader.time_index ? index : Colon() for
i in 1:length(NCDatasets.dimnames(var))
]
return file_reader.preprocess_func.(
file_reader.dataset[file_reader.varname][slicer...]
)
end
var = file_reader.dataset[file_reader.varname]
slicer = [
i == file_reader.time_index ? index : Colon() for
i in 1:length(NCDatasets.dimnames(var))
]
return file_reader.preprocess_func.(
file_reader.dataset[file_reader.varname][slicer...]
)
end

"""
Expand Down Expand Up @@ -226,4 +232,28 @@ function FileReaders.read(file_reader::NCFileReader)
end
end

"""
    read!(dest, file_reader::NCFileReader)

Read and preprocess data (for static datasets), saving the output to `dest`.

`dest` must be broadcast-compatible with the data being read.
"""
function FileReaders.read!(dest, file_reader::NCFileReader)
    # Delegate to the non-mutating `read`, then copy the result into the
    # caller-provided buffer in place.
    data = FileReaders.read(file_reader)
    dest .= data
    return nothing
end

"""
    read!(dest, file_reader::NCFileReader, date::Dates.DateTime)

Read and preprocess the data at the given `date`, saving the output to `dest`.

`dest` must be broadcast-compatible with the data being read.
"""
function FileReaders.read!(
    dest,
    file_reader::NCFileReader,
    date::Dates.DateTime,
)
    # On a cache hit, `read` allocates a defensive copy (to protect the cache
    # from mutation through the returned array). Since we immediately broadcast
    # into `dest` and discard that copy, read the cached array directly instead
    # and skip the intermediate allocation.
    if haskey(file_reader._cached_reads, date)
        dest .= file_reader._cached_reads[date]
    else
        dest .= FileReaders.read(file_reader, date)
    end
    return nothing
end

end
2 changes: 2 additions & 0 deletions src/FileReaders.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ function NCFileReader end

# Function stubs: concrete methods for these are provided by the
# NCFileReaderExt extension (loaded when NCDatasets is available).
function read end

# In-place variant of `read`: fills a caller-provided, preallocated array
# instead of returning a new one.
function read! end

function available_dates end

function close_all_ncfiles end
Expand Down
12 changes: 12 additions & 0 deletions test/file_readers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ using NCDatasets
@test FileReaders.read(ncreader_u, DateTime(2021, 01, 01, 01)) ==
nc["u10n"][:, :, 2]

# Test read!
dest = copy(nc["u10n"][:, :, 2])
fill!(dest, 0)
FileReaders.read!(dest, ncreader_u, DateTime(2021, 01, 01, 01))
@test dest == nc["u10n"][:, :, 2]

# Test that we need to close all the variables to close the file
open_ncfiles =
Base.get_extension(
Expand Down Expand Up @@ -73,6 +79,12 @@ end

@test FileReaders.read(ncreader) == nc["u10n"][:, :]

# Test read!
dest = copy(nc["u10n"][:, :])
fill!(dest, 0)
FileReaders.read!(dest, ncreader)
@test dest == nc["u10n"][:, :]

@test isempty(FileReaders.available_dates(ncreader))

FileReaders.close_all_ncfiles()
Expand Down