From 976ec2a7bfdf508c527cee305d00d39d138d28c0 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Tue, 7 Jun 2022 16:50:01 -0400 Subject: [PATCH] Add copyto! and similar for Datasets (#937) * Add read!, copyto! and similar for Datasets * Remove views code from this pull request * Remove read!, refactor similar * Fix normalize keyword, add to tests * Use Dims * Add handle hygine to _generic_read * Fix normalize keyword * More normalize keyword fixes * Fix normalize --- src/HDF5.jl | 268 +++++++++++++++++++++++++++++++++++------- test/nonallocating.jl | 39 ++++++ test/runtests.jl | 2 + 3 files changed, 267 insertions(+), 42 deletions(-) create mode 100644 test/nonallocating.jl diff --git a/src/HDF5.jl b/src/HDF5.jl index 566fd663b..806df46b2 100644 --- a/src/HDF5.jl +++ b/src/HDF5.jl @@ -859,10 +859,32 @@ function Base.read(obj::DatasetOrAttribute, ::Type{String}, I...) return val end +""" + copyto!(output_buffer::AbstractArray{T}, obj::Union{DatasetOrAttribute}) where T + +Copy [part of] a HDF5 dataset or attribute to a preallocated output buffer. +The output buffer must be convertible to a pointer and have a contiguous layout. +""" +function Base.copyto!(output_buffer::AbstractArray{T}, obj::DatasetOrAttribute, I...) where T + dtype = datatype(obj) + val = nothing + try + val = generic_read!(output_buffer, obj, dtype, T, I...) 
+ finally + close(dtype) + end + return val +end + # Special handling for reading OPAQUE datasets and attributes -function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque}) +function generic_read!(buf::Matrix{UInt8}, obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque}) + generic_read(obj, filetype, Opaque, buf) +end +function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque}, buf::Union{Matrix{UInt8}, Nothing} = nothing) sz = size(obj) - buf = Matrix{UInt8}(undef, sizeof(filetype), prod(sz)) + if isnothing(buf) + buf = Matrix{UInt8}(undef, sizeof(filetype), prod(sz)) + end if obj isa Dataset read_dataset(obj, filetype, buf, obj.xfer) else @@ -880,11 +902,164 @@ function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque end # generic read function +function generic_read!(buf::Union{AbstractMatrix{UInt8}, AbstractArray{T}}, obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I...) where T + return _generic_read(obj, filetype, T, buf, I...) +end function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I...) where T + return _generic_read(obj, filetype, T, nothing, I...) +end +function _generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, + buf::Union{AbstractMatrix{UInt8}, AbstractArray{T}, Nothing}, I...) where T + + sz, scalar, dspace = _size_of_buffer(obj, I) + + if isempty(sz) + close(dspace) + return EmptyArray{T}() + end + + try + if isnothing(buf) + buf = _normalized_buffer(T, sz) + else + sizeof(buf) != prod(sz)*sizeof(T) && + error("Provided array buffer of size, $(size(buf)), and element type, $(eltype(buf)), does not match the dataset of size, $sz, and type, $T") + end + catch err + close(dspace) + rethrow(err) + end + + memtype = _memtype(filetype, T) + memspace = isempty(I) ? 
dspace : dataspace(sz) + + try + if obj isa Dataset + API.h5d_read(obj, memtype, memspace, dspace, obj.xfer, buf) + else + API.h5a_read(obj, memtype, buf) + end + + if do_normalize(T) + out = reshape(normalize_types(T, buf), sz...) + else + out = buf + end + + xfer_id = obj isa Dataset ? obj.xfer.id : API.H5P_DEFAULT + do_reclaim(T) && API.h5d_vlen_reclaim(memtype, memspace, xfer_id, buf) + + if scalar + return out[1] + else + return out + end + + finally + close(memtype) + close(memspace) + close(dspace) + end +end + + +""" + similar(obj::DatasetOrAttribute, [::Type{T}], [dims::Integer...]; normalize = true) + +Return an `Array{T}` or `Matrix{UInt8}` that can contain [part of] the dataset. + +The `normalize` keyword will normalize the buffer for string and array datatypes. +""" +function Base.similar( + obj::DatasetOrAttribute, + ::Type{T}, + dims::Dims; + normalize::Bool = true +) where T + filetype = datatype(obj) + try + return similar(obj, filetype, T, dims; normalize=normalize) + finally + close(filetype) + end +end +Base.similar( + obj::DatasetOrAttribute, + ::Type{T}, + dims::Integer...; + normalize::Bool = true +) where T = similar(obj, T, Int.(dims); normalize=normalize) + +# Base.similar without specifying the Julia type +function Base.similar(obj::DatasetOrAttribute, dims::Dims; normalize::Bool = true) + filetype = datatype(obj) + try + T = get_jl_type(filetype) + return similar(obj, filetype, T, dims; normalize=normalize) + finally + close(filetype) + end +end +Base.similar( + obj::DatasetOrAttribute, + dims::Integer...; + normalize::Bool = true +) = similar(obj, Int.(dims); normalize=normalize) + +# Opaque types +function Base.similar(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque}; normalize::Bool = true) + # normalize keyword for consistency, but it is ignored for Opaque + sz = size(obj) + return Matrix{UInt8}(undef, sizeof(filetype), prod(sz)) +end + +# Undocumented Base.similar signature allowing filetype to be specified +function 
Base.similar( + obj::DatasetOrAttribute, + filetype::Datatype, + ::Type{T}, + dims::Dims; + normalize::Bool = true +) where T + # We are reusing code that expects indices + I = Base.OneTo.(dims) + sz, scalar, dspace = _size_of_buffer(obj, I) + memtype = _memtype(filetype, T) + try + buf = _normalized_buffer(T, sz) + + if normalize && do_normalize(T) + buf = reshape(normalize_types(T, buf), sz) + end + + return buf + finally + close(dspace) + close(memtype) + end +end +Base.similar( + obj::DatasetOrAttribute, + filetype::Datatype, + ::Type{T}, + dims::Integer...; + normalize::Bool = true +) where T = similar(obj, filetype, T, Int.(dims); normalize=normalize) + +# Utilities used in Base.similar implementation + +#= + _memtype(filetype::Datatype, T) + +This is a utility function originally from generic_read. +It gets the native memory type for the system based on filetype, and checks +if the size matches. +=# +@inline function _memtype(filetype::Datatype, ::Type{T}) where T !isconcretetype(T) && error("type $T is not concrete") - !isempty(I) && obj isa Attribute && error("HDF5 attributes do not support hyperslab selections") - memtype = Datatype(API.h5t_get_native_type(filetype)) # padded layout in memory + # padded layout in memory + memtype = Datatype(API.h5t_get_native_type(filetype)) if sizeof(T) != sizeof(memtype) error(""" @@ -894,11 +1069,37 @@ function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I. """) end - dspace = dataspace(obj) + return memtype +end + +#= + _size_of_buffer(obj::DatasetOrAttribute, [I::Tuple, dspace::Dataspace]) + +This is a utility function originally from generic_read, but factored out. +The primary purpose is to determine the size and shape of the buffer to +create in order to hold the contents of a Dataset or Attribute. + +# Arguments +* obj - A Dataset or Attribute +* I - (optional) indices, defaults to () +* dspace - (optional) dataspace, defaults to dataspace(obj). 
+ This argument will be consumed by hyperslab and returned. + +# Returns +* `sz` the size of the selection +* `scalar`, which is true if the value should be read as a scalar. +* `dspace`, hyper +=# +@inline function _size_of_buffer( + obj::DatasetOrAttribute, + I::Tuple = (), + dspace::Dataspace = dataspace(obj) +) + !isempty(I) && obj isa Attribute && error("HDF5 attributes do not support hyperslab selections") + stype = API.h5s_get_simple_extent_type(dspace) - stype == API.H5S_NULL && return EmptyArray{T}() - if !isempty(I) + if !isempty(I) && stype != API.H5S_NULL indices = Base.to_indices(obj, I) dspace = hyperslab(dspace, indices...) end @@ -907,16 +1108,32 @@ function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I. if stype == API.H5S_SCALAR sz = (1,) scalar = true + elseif stype == API.H5S_NULL + sz = () + # scalar = false elseif isempty(I) sz = size(dspace) + # scalar = false else + # Determine the size by the length of non-Int indices sz = map(length, filter(i -> !isa(i, Int), indices)) if isempty(sz) + # All indices are Int, so this is scalar sz = (1,) scalar = true end end + return sz, scalar, dspace +end + +#= + _normalized_buffer(T, sz) + +Return a Matrix{UInt8} for a normalized type or `Array{T}` for a regular type. +See `do_normalize` in typeconversions.jl. +=# +@inline function _normalized_buffer(::Type{T}, sz::NTuple{N, Int}) where {T, N} if do_normalize(T) # The entire dataset is read into in a buffer matrix where the first dimension at # any stage of normalization is the bytes for a single element of type `T`, and @@ -925,32 +1142,8 @@ function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I. else buf = Array{T}(undef, sz...) end - memspace = isempty(I) ? dspace : dataspace(sz) - - if obj isa Dataset - API.h5d_read(obj, memtype, memspace, dspace, obj.xfer, buf) - else - API.h5a_read(obj, memtype, buf) - end - - if do_normalize(T) - out = reshape(normalize_types(T, buf), sz...) 
- else - out = buf - end - - xfer_id = obj isa Dataset ? obj.xfer.id : API.H5P_DEFAULT - do_reclaim(T) && API.h5d_vlen_reclaim(memtype, memspace, xfer_id, buf) - close(memtype) - close(memspace) - close(dspace) - - if scalar - return out[1] - else - return out - end + return buf end # Array constructor for datasets @@ -1160,18 +1353,9 @@ function Base.setindex!(dset::Dataset, X::Array{T}, I::IndexType...) where T end filetype = datatype(dset) - memtype = Datatype(API.h5t_get_native_type(filetype)) # padded layout in memory + memtype = _memtype(filetype, eltype(X)) close(filetype) - elT = eltype(X) - if sizeof(elT) != sizeof(memtype) - error(""" - Type size mismatch - sizeof($elT) = $(sizeof(elT)) - sizeof($memtype) = $(sizeof(memtype)) - """) - end - dspace = dataspace(dset) stype = API.h5s_get_simple_extent_type(dspace) stype == API.H5S_NULL && error("attempting to write to null dataspace") diff --git a/test/nonallocating.jl b/test/nonallocating.jl new file mode 100644 index 000000000..ff00699f3 --- /dev/null +++ b/test/nonallocating.jl @@ -0,0 +1,39 @@ +using HDF5 +using Test + +@testset "non-allocating methods" begin + fn = tempname() + + data = rand(UInt16, 16, 16) + + h5open(fn, "w") do h5f + h5f["data"] = data + end + + h5open(fn, "r") do h5f + buffer = similar(h5f["data"]) + copyto!(buffer, h5f["data"]) + @test isequal(buffer, data) + + # Consider making this a view later + v = h5f["data"][1:4, 1:4] + + buffer = similar(v) + @test size(buffer) == (4,4) + copyto!(buffer, v) + @test isequal(buffer, @view(data[1:4, 1:4])) + + @test size(similar(h5f["data"], Int16)) == size(h5f["data"]) + @test size(similar(h5f["data"], 5,6)) == (5, 6) + @test size(similar(h5f["data"], Int16, 8,7)) == (8,7) + @test size(similar(h5f["data"], Int16, 8,7; normalize = false)) == (8,7) + @test_broken size(similar(h5f["data"], Int8, 8,7)) == (8,7) + + @test size(similar(h5f["data"], (5,6))) == (5, 6) + @test size(similar(h5f["data"], Int16, (8,7))) == (8,7) + @test 
size(similar(h5f["data"], Int16, (8,7); normalize = false)) == (8,7) + @test size(similar(h5f["data"], Int16, 0x8,0x7; normalize = false)) == (8,7) + end + + rm(fn) +end diff --git a/test/runtests.jl b/test/runtests.jl index 3dfb41e33..e97a5ff00 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -50,6 +50,8 @@ include("filter.jl") include("chunkstorage.jl") @debug "fileio" include("fileio.jl") +@debug "nonallocating" +include("nonallocating.jl") @debug "filter test utils" include("filters/FilterTestUtils.jl")