From 976ec2a7bfdf508c527cee305d00d39d138d28c0 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Tue, 7 Jun 2022 16:50:01 -0400 Subject: [PATCH] Add copyto! and similar for Datasets (#937) * Add read!, copyto! and similar for Datasets * Remove views code from this pull request * Remove read!, refactor similar * Fix normalize keyword, add to tests * Use Dims * Add handle hygine to _generic_read * Fix normalize keyword * More normalize keyword fixes * Fix normalize --- src/HDF5.jl | 268 +++++++++++++++++++++++++++++++++++------- test/nonallocating.jl | 39 ++++++ test/runtests.jl | 2 + 3 files changed, 267 insertions(+), 42 deletions(-) create mode 100644 test/nonallocating.jl diff --git a/src/HDF5.jl b/src/HDF5.jl index 566fd663b..806df46b2 100644 --- a/src/HDF5.jl +++ b/src/HDF5.jl @@ -859,10 +859,32 @@ function Base.read(obj::DatasetOrAttribute, ::Type{String}, I...) return val end +""" + copyto!(output_buffer::AbstractArray{T}, obj::Union{DatasetOrAttribute}) where T + +Copy [part of] a HDF5 dataset or attribute to a preallocated output buffer. +The output buffer must be convertible to a pointer and have a contiguous layout. +""" +function Base.copyto!(output_buffer::AbstractArray{T}, obj::DatasetOrAttribute, I...) where T + dtype = datatype(obj) + val = nothing + try + val = generic_read!(output_buffer, obj, dtype, T, I...) 
+ finally + close(dtype) + end + return val +end + # Special handling for reading OPAQUE datasets and attributes -function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque}) +function generic_read!(buf::Matrix{UInt8}, obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque}) + generic_read(obj, filetype, Opaque, buf) +end +function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque}, buf::Union{Matrix{UInt8}, Nothing} = nothing) sz = size(obj) - buf = Matrix{UInt8}(undef, sizeof(filetype), prod(sz)) + if isnothing(buf) + buf = Matrix{UInt8}(undef, sizeof(filetype), prod(sz)) + end if obj isa Dataset read_dataset(obj, filetype, buf, obj.xfer) else @@ -880,11 +902,164 @@ function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque end # generic read function +function generic_read!(buf::Union{AbstractMatrix{UInt8}, AbstractArray{T}}, obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I...) where T + return _generic_read(obj, filetype, T, buf, I...) +end function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I...) where T + return _generic_read(obj, filetype, T, nothing, I...) +end +function _generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, + buf::Union{AbstractMatrix{UInt8}, AbstractArray{T}, Nothing}, I...) where T + + sz, scalar, dspace = _size_of_buffer(obj, I) + + if isempty(sz) + close(dspace) + return EmptyArray{T}() + end + + try + if isnothing(buf) + buf = _normalized_buffer(T, sz) + else + sizeof(buf) != prod(sz)*sizeof(T) && + error("Provided array buffer of size, $(size(buf)), and element type, $(eltype(buf)), does not match the dataset of size, $sz, and type, $T") + end + catch err + close(dspace) + rethrow(err) + end + + memtype = _memtype(filetype, T) + memspace = isempty(I) ? 
dspace : dataspace(sz) + + try + if obj isa Dataset + API.h5d_read(obj, memtype, memspace, dspace, obj.xfer, buf) + else + API.h5a_read(obj, memtype, buf) + end + + if do_normalize(T) + out = reshape(normalize_types(T, buf), sz...) + else + out = buf + end + + xfer_id = obj isa Dataset ? obj.xfer.id : API.H5P_DEFAULT + do_reclaim(T) && API.h5d_vlen_reclaim(memtype, memspace, xfer_id, buf) + + if scalar + return out[1] + else + return out + end + + finally + close(memtype) + close(memspace) + close(dspace) + end +end + + +""" + similar(obj::DatasetOrAttribute, [::Type{T}], [dims::Integer...]; normalize = true) + +Return an `Array{T}` or `Matrix{UInt8}` that can contain [part of] the dataset. + +The `normalize` keyword will normalize the buffer for string and array datatypes. +""" +function Base.similar( + obj::DatasetOrAttribute, + ::Type{T}, + dims::Dims; + normalize::Bool = true +) where T + filetype = datatype(obj) + try + return similar(obj, filetype, T, dims; normalize=normalize) + finally + close(filetype) + end +end +Base.similar( + obj::DatasetOrAttribute, + ::Type{T}, + dims::Integer...; + normalize::Bool = true +) where T = similar(obj, T, Int.(dims); normalize=normalize) + +# Base.similar without specifying the Julia type +function Base.similar(obj::DatasetOrAttribute, dims::Dims; normalize::Bool = true) + filetype = datatype(obj) + try + T = get_jl_type(filetype) + return similar(obj, filetype, T, dims; normalize=normalize) + finally + close(filetype) + end +end +Base.similar( + obj::DatasetOrAttribute, + dims::Integer...; + normalize::Bool = true +) = similar(obj, Int.(dims); normalize=normalize) + +# Opaque types +function Base.similar(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque}; normalize::Bool = true) + # normalize keyword for consistency, but it is ignored for Opaque + sz = size(obj) + return Matrix{UInt8}(undef, sizeof(filetype), prod(sz)) +end + +# Undocumented Base.similar signature allowing filetype to be specified +function 
Base.similar( + obj::DatasetOrAttribute, + filetype::Datatype, + ::Type{T}, + dims::Dims; + normalize::Bool = true +) where T + # We are reusing code that expects indices + I = Base.OneTo.(dims) + sz, scalar, dspace = _size_of_buffer(obj, I) + memtype = _memtype(filetype, T) + try + buf = _normalized_buffer(T, sz) + + if normalize && do_normalize(T) + buf = reshape(normalize_types(T, buf), sz) + end + + return buf + finally + close(dspace) + close(memtype) + end +end +Base.similar( + obj::DatasetOrAttribute, + filetype::Datatype, + ::Type{T}, + dims::Integer...; + normalize::Bool = true +) where T = similar(obj, filetype, T, Int.(dims); normalize=normalize) + +# Utilities used in Base.similar implementation + +#= + _memtype(filetype::Datatype, T) + +This is a utility function originally from generic_read. +It gets the native memory type for the system based on filetype, and checks +if the size matches. +=# +@inline function _memtype(filetype::Datatype, ::Type{T}) where T !isconcretetype(T) && error("type $T is not concrete") - !isempty(I) && obj isa Attribute && error("HDF5 attributes do not support hyperslab selections") - memtype = Datatype(API.h5t_get_native_type(filetype)) # padded layout in memory + # padded layout in memory + memtype = Datatype(API.h5t_get_native_type(filetype)) if sizeof(T) != sizeof(memtype) error(""" @@ -894,11 +1069,37 @@ function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I. """) end - dspace = dataspace(obj) + return memtype +end + +#= + _size_of_buffer(obj::DatasetOrAttribute, [I::Tuple, dspace::Dataspace]) + +This is a utility function originally from generic_read, but factored out. +The primary purpose is to determine the size and shape of the buffer to +create in order to hold the contents of a Dataset or Attribute. + +# Arguments +* obj - A Dataset or Attribute +* I - (optional) indices, defaults to () +* dspace - (optional) dataspace, defaults to dataspace(obj). 
+ This argument will be consumed by hyperslab and returned. + +# Returns +* `sz` the size of the selection +* `scalar`, which is true if the value should be read as a scalar. +* `dspace`, hyper +=# +@inline function _size_of_buffer( + obj::DatasetOrAttribute, + I::Tuple = (), + dspace::Dataspace = dataspace(obj) +) + !isempty(I) && obj isa Attribute && error("HDF5 attributes do not support hyperslab selections") + stype = API.h5s_get_simple_extent_type(dspace) - stype == API.H5S_NULL && return EmptyArray{T}() - if !isempty(I) + if !isempty(I) && stype != API.H5S_NULL indices = Base.to_indices(obj, I) dspace = hyperslab(dspace, indices...) end @@ -907,16 +1108,32 @@ function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I. if stype == API.H5S_SCALAR sz = (1,) scalar = true + elseif stype == API.H5S_NULL + sz = () + # scalar = false elseif isempty(I) sz = size(dspace) + # scalar = false else + # Determine the size by the length of non-Int indices sz = map(length, filter(i -> !isa(i, Int), indices)) if isempty(sz) + # All indices are Int, so this is scalar sz = (1,) scalar = true end end + return sz, scalar, dspace +end + +#= + _normalized_buffer(T, sz) + +Return a Matrix{UInt8} for a normalized type or `Array{T}` for a regular type. +See `do_normalize` in typeconversions.jl. +=# +@inline function _normalized_buffer(::Type{T}, sz::NTuple{N, Int}) where {T, N} if do_normalize(T) # The entire dataset is read into in a buffer matrix where the first dimension at # any stage of normalization is the bytes for a single element of type `T`, and @@ -925,32 +1142,8 @@ function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I. else buf = Array{T}(undef, sz...) end - memspace = isempty(I) ? dspace : dataspace(sz) - - if obj isa Dataset - API.h5d_read(obj, memtype, memspace, dspace, obj.xfer, buf) - else - API.h5a_read(obj, memtype, buf) - end - - if do_normalize(T) - out = reshape(normalize_types(T, buf), sz...) 
- else - out = buf - end - - xfer_id = obj isa Dataset ? obj.xfer.id : API.H5P_DEFAULT - do_reclaim(T) && API.h5d_vlen_reclaim(memtype, memspace, xfer_id, buf) - close(memtype) - close(memspace) - close(dspace) - - if scalar - return out[1] - else - return out - end + return buf end # Array constructor for datasets @@ -1160,18 +1353,9 @@ function Base.setindex!(dset::Dataset, X::Array{T}, I::IndexType...) where T end filetype = datatype(dset) - memtype = Datatype(API.h5t_get_native_type(filetype)) # padded layout in memory + memtype = _memtype(filetype, eltype(X)) close(filetype) - elT = eltype(X) - if sizeof(elT) != sizeof(memtype) - error(""" - Type size mismatch - sizeof($elT) = $(sizeof(elT)) - sizeof($memtype) = $(sizeof(memtype)) - """) - end - dspace = dataspace(dset) stype = API.h5s_get_simple_extent_type(dspace) stype == API.H5S_NULL && error("attempting to write to null dataspace") diff --git a/test/nonallocating.jl b/test/nonallocating.jl new file mode 100644 index 000000000..ff00699f3 --- /dev/null +++ b/test/nonallocating.jl @@ -0,0 +1,39 @@ +using HDF5 +using Test + +@testset "non-allocating methods" begin + fn = tempname() + + data = rand(UInt16, 16, 16) + + h5open(fn, "w") do h5f + h5f["data"] = data + end + + h5open(fn, "r") do h5f + buffer = similar(h5f["data"]) + copyto!(buffer, h5f["data"]) + @test isequal(buffer, data) + + # Consider making this a view later + v = h5f["data"][1:4, 1:4] + + buffer = similar(v) + @test size(buffer) == (4,4) + copyto!(buffer, v) + @test isequal(buffer, @view(data[1:4, 1:4])) + + @test size(similar(h5f["data"], Int16)) == size(h5f["data"]) + @test size(similar(h5f["data"], 5,6)) == (5, 6) + @test size(similar(h5f["data"], Int16, 8,7)) == (8,7) + @test size(similar(h5f["data"], Int16, 8,7; normalize = false)) == (8,7) + @test_broken size(similar(h5f["data"], Int8, 8,7)) == (8,7) + + @test size(similar(h5f["data"], (5,6))) == (5, 6) + @test size(similar(h5f["data"], Int16, (8,7))) == (8,7) + @test 
size(similar(h5f["data"], Int16, (8,7); normalize = false)) == (8,7) + @test size(similar(h5f["data"], Int16, 0x8,0x7; normalize = false)) == (8,7) + end + + rm(fn) +end diff --git a/test/runtests.jl b/test/runtests.jl index 3dfb41e33..e97a5ff00 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -50,6 +50,8 @@ include("filter.jl") include("chunkstorage.jl") @debug "fileio" include("fileio.jl") +@debug "nonallocating" +include("nonallocating.jl") @debug "filter test utils" include("filters/FilterTestUtils.jl")