Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion src/Filters.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ Encodes and decodes variable-length arrays of arbitrary data type
"""
struct VLenArrayFilter{T} <: Filter{T,UInt8} end

"""
    VLenUTF8Filter

Encodes and decodes variable-length unicode strings.

A chunk is serialised as a `UInt32` item count followed, for every string, by a
`UInt32` byte length and the bytes of the string. In JSON the filter is represented
with the id `"vlen-utf8"`.
"""
struct VLenUTF8Filter <: Filter{String, UInt8} end

function zdecode(ain, ::VLenArrayFilter{T}) where T
f = IOBuffer(ain)
nitems = read(f, UInt32)
Expand All @@ -51,8 +58,34 @@ function zencode(ain,::VLenArrayFilter)
take!(b)
end

"""
    zdecode(ain, ::VLenUTF8Filter)

Decode the byte buffer `ain` into a `Vector{String}`.

The buffer starts with a `UInt32` item count; every item consists of a `UInt32`
byte length followed by that many bytes of string data.

# Throws
- `EOFError`: if `ain` ends before all announced lengths and string bytes were read.
"""
function zdecode(ain, ::VLenUTF8Filter)
    f = IOBuffer(ain)
    nitems = read(f, UInt32)
    out = Vector{String}(undef, nitems)
    for i in eachindex(out)
        clen = read(f, UInt32)
        bytes = read(f, clen)
        # `read(io, nb)` returns fewer than `nb` bytes at the end of the stream instead
        # of failing, so a truncated buffer would otherwise give silently shortened
        # strings.
        length(bytes) == clen || throw(EOFError())
        out[i] = String(bytes)
    end
    close(f)
    return out
end

# Serialise the strings in `ain`: a `UInt32` item count, then for every string a
# `UInt32` byte length followed by the string's bytes.
function zencode(ain, ::VLenUTF8Filter)
    buf = IOBuffer()
    write(buf, UInt32(length(ain)))
    for item in ain
        str = transcode(String, item)
        nbytes = UInt32(ncodeunits(str))
        write(buf, nbytes)
        write(buf, str)
    end
    return take!(buf)
end

# JSON representation of the filters: an "id" entry, plus the element type string
# for the array filter.
function JSON.lower(::VLenArrayFilter{T}) where {T}
    return Dict("id" => "vlen-array", "dtype" => typestr(T))
end
function JSON.lower(::VLenUTF8Filter)
    return Dict("id" => "vlen-utf8")
end

# Rebuild a filter from its parsed description `f`.
function getfilter(::Type{<:VLenArrayFilter}, f)
    elementtype = typestr(f["dtype"])
    return VLenArrayFilter{elementtype}()
end
function getfilter(::Type{<:VLenUTF8Filter}, f)
    return VLenUTF8Filter()
end

# Maps each filter id (as produced by `JSON.lower`) to the filter type implementing it.
filterdict = Dict("vlen-array"=>VLenArrayFilter, "vlen-utf8"=>VLenUTF8Filter)
22 changes: 8 additions & 14 deletions src/ZArray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ const concurrent_io_tasks = Ref(50)
getfillval(::Type{T}, t::String) where {T <: Number} = parse(T, t)
getfillval(::Type{T}, t::Union{T,Nothing}) where {T} = t

# Array wrapper around `x` that carries a sentinel value `senval` of the same element
# type; its element type is `Union{T,Missing}`. The sentinel is stored in a field, not
# in a type parameter.
struct SenMissArray{T,N} <: AbstractArray{Union{T,Missing},N}
    x::Array{T,N}
    senval::T
end
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know this was a quite ad-hoc definition of SenMissArray; it is probably better to keep the sentinel value in a struct field instead of a type parameter. Now that the idea has also been taken up in https://github.com/JuliaData/SentinelArrays.jl, it might be an option to just use that implementation. This should not stop this PR from being merged, though.

# Wrap `x`, converting the sentinel `v` to the element type of `x`.
SenMissArray(x::Array{T,N},v) where {T,N} = SenMissArray{T,N}(x,convert(T,v))
Base.size(x::SenMissArray) = size(x.x)
# The sentinel is read from the struct field.
senval(x::SenMissArray) = x.senval
function Base.getindex(x::SenMissArray,i::Int)
v = x.x[i]
isequal(v,senval(x)) ? missing : v
Expand Down Expand Up @@ -78,6 +79,7 @@ storageratio(z::ZArray{<:Vector}) = "unknown"

# Number of elements times the size of one element; reported as "unknown" for
# vector and string element types.
function nobytes(z::ZArray)
    nelements = length(z)
    return nelements * sizeof(eltype(z))
end
nobytes(::ZArray{<:Vector}) = "unknown"
nobytes(::ZArray{<:String}) = "unknown"

zinfo(z::ZArray) = zinfo(stdout,z)
function zinfo(io::IO,z::ZArray)
Expand Down Expand Up @@ -361,20 +363,12 @@ function filterfromtype(::Type{<:AbstractArray{T}}) where T
(VLenArrayFilter{T}(),)
end

# String element types, optionally combined with `Missing`, get the variable-length
# UTF-8 filter.
filterfromtype(::Type{<:Union{<:AbstractString, Union{<:AbstractString, Missing}}}) = (VLenUTF8Filter(),)
# `MaxLengthString` element types, optionally combined with `Missing`, get no filter.
filterfromtype(::Type{<:Union{MaxLengthString, Union{MaxLengthString, Missing}}}) = nothing

#Not all Array types can be mapped directly to a valid ZArray encoding.
#Here we try to determine the correct element type
# Generic case: the element type is used unchanged. String arrays take this path as
# well and keep `String` (or `Union{String,Missing}`) as element type; they are no
# longer mapped to a `MaxLengthString` type.
to_zarrtype(::AbstractArray{T}) where T = T
to_zarrtype(a::AbstractArray{<:Date}) = DateTime64{Dates.Day}
to_zarrtype(a::AbstractArray{<:DateTime}) = DateTime64{Dates.Millisecond}

Expand Down
5 changes: 4 additions & 1 deletion src/metadata.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Base.convert(::Type{DateTime64{P}}, t::Date) where P = DateTime64{P}(Dates.value
# DateTime -> DateTime64{P}: stores the number of periods `P` between `DateTime(1970)`
# and `t`.
Base.convert(::Type{DateTime64{P}}, t::DateTime) where P = DateTime64{P}(Dates.value(P(t-DateTime(1970))))
# DateTime64{Q} -> DateTime64{P}: re-expresses the stored count `t.i` from period `Q`
# in period `P`.
Base.convert(::Type{DateTime64{P}}, t::DateTime64{Q}) where {P,Q} = DateTime64{P}(Dates.value(P(Q(t.i))))
# Zero of a `DateTime64` type is the value constructed from a count of 0. For a value
# the result is built from its type: the previous `t(0)` called the instance itself,
# which presumably is not callable (NOTE(review): confirm).
Base.zero(::Type{T}) where {T<:DateTime64} = T(0)
Base.zero(t::DateTime64) = zero(typeof(t))
# NOTE(review): `Base.zero` and `String` are both owned by Base, so this method is type
# piracy; kept because code elsewhere may rely on `zero(String)` returning "".
Base.zero(t::Union{String, Type{String}}) = ""
# Base.promote_rule(::Type{<:DateTime64{<:Dates.DatePeriod}}, ::Type{Date}) = Date
# Base.promote_rule(::Type{<:DateTime64{<:Dates.DatePeriod}}, ::Type{DateTime}) = DateTime
# Base.promote_rule(::Type{<:DateTime64{<:Dates.TimePeriod}}, ::Type{Date}) = DateTime
Expand All @@ -63,6 +64,7 @@ typestr(::Type{MaxLengthString{N,UInt32}}) where N = string('<', 'U', N)
# Type strings for further element types: `UInt8`-based `MaxLengthString{N}` gives
# "<S" followed by N; arrays and strings give the object type "|O"; `DateTime64{P}`
# gives "<M8[...]" with the unit looked up in `pdt64string`.
typestr(::Type{MaxLengthString{N,UInt8}}) where N = string('<', 'S', N)
typestr(::Type{<:Array}) = "|O"
typestr(::Type{<:DateTime64{P}}) where P = "<M8[$(pdt64string[P])]"
typestr(::Type{<:AbstractString}) = "|O"

const typestr_regex = r"^([<|>])([tbiufcmMOSUV])(\d*)(\[\w+\])?$"
const typemap = Dict{Tuple{Char, Int}, DataType}(
Expand Down Expand Up @@ -96,7 +98,7 @@ function typestr(s::AbstractString, filterlist=nothing)
if filterlist === nothing
throw(ArgumentError("Object array can only be parsed when an appropriate filter is defined"))
end
return Vector{sourcetype(first(filterlist))}
return sourcetype(first(filterlist))
end
isempty(typesize) && throw((ArgumentError("$s is not a valid numpy typestr")))
tc, ts = first(typecode), parse(Int, typesize)
Expand Down Expand Up @@ -243,4 +245,5 @@ Base.eltype(::Metadata{T}) where T = T
# Decode a stored fill value `v` into a value for element type `T`.
# Strings are parsed for numeric types, `nothing` is passed through, and everything
# else goes through the constructor `T(v)`.
fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v)
fill_value_decoding(v::Nothing, ::Any) = v
fill_value_decoding(v, T) = T(v)
# For `String`, a numeric 0 means the empty string; any other number is used as a
# single byte.
fill_value_decoding(v::Number, T::Type{String}) = v == 0 ? "" : T(UInt8[v])
# For `ASCIIChar`, an empty string means "no fill value".
fill_value_decoding(v, ::Type{ASCIIChar}) = v == "" ? nothing : v
# Disambiguation: a `nothing` fill value for `ASCIIChar` matches both the
# `(::Nothing, ::Any)` and the `(::Any, ::Type{ASCIIChar})` methods equally well, so
# without this method the call fails as ambiguous. Both would return `nothing`.
fill_value_decoding(v::Nothing, ::Type{ASCIIChar}) = v
9 changes: 7 additions & 2 deletions test/python.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ dtypes = (UInt8, UInt16, UInt32, UInt64,
Int8, Int16, Int32, Int64,
Float16, Float32, Float64,
Complex{Float32}, Complex{Float64},
Bool,MaxLengthString{10,UInt8},MaxLengthString{10,UInt32})
Bool,MaxLengthString{10,UInt8},MaxLengthString{10,UInt32},
String)
compressors = (
"no"=>NoCompressor(),
"blosc"=>BloscCompressor(cname="zstd"),
Expand Down Expand Up @@ -59,7 +60,7 @@ gatts = g.attrs
# Python-side dtype names; presumably index-aligned with `dtypes`, with "O" as the
# counterpart of `String` (NOTE(review): confirm against `dtypes`).
dtypesp = ("uint8","uint16","uint32","uint64",
    "int8","int16","int32","int64",
    "float16","float32","float64",
    "complex64", "complex128","bool","S10","U10", "O")

#Test accessing arrays from python and reading data
for i=1:length(dtypes), co in compressors
Expand Down Expand Up @@ -87,13 +88,16 @@ end
data = rand(Int32,2,6,10)
py"""
import numcodecs
import numpy as np
g = zarr.group($ppython)
g.attrs["groupatt"] = "Hi"
z1 = g.create_dataset("a1", shape=(2,6,10),chunks=(1,2,3), dtype='i4')
z1[:,:,:]=$data
z1.attrs["test"]={"b": 6}
z2 = g.create_dataset("a2", shape=(5,),chunks=(5,), dtype='S1', compressor=numcodecs.Zlib())
z2[:]=[k for k in 'hallo']
z3 = g.create_dataset('a3', shape=(2,), dtype=str)
z3[:]=np.asarray(['test1', 'test234'], dtype='O')
zarr.consolidate_metadata($ppython)
"""

Expand All @@ -107,6 +111,7 @@ a1 = g["a1"]
@test a1.attrs["test"]==Dict("b"=>6)
# Test reading the string array
@test String(g["a2"][:])=="hallo"
@test g["a3"] == ["test1", "test234"]

# And test for consolidated metadata
# Delete files so we make sure they are not accessed
Expand Down
7 changes: 3 additions & 4 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -218,13 +218,12 @@ end
end

@testset "string array getindex/setindex" begin
    aa = ["this", "is", "all ", "ascii"]
    bb = ["And" "Unicode"; "ματριξ" missing]
    a = ZArray(aa)
    b = ZArray(bb, fill_value = "")
    # String arrays keep `String` as element type, so only the `String` expectations
    # remain; the former `MaxLengthString` assertions contradicted them.
    @test eltype(a) == String
    @test eltype(b) == Union{String,Missing}
    @test a[:] == ["this", "is", "all ", "ascii"]
    @test all(isequal.(b[:,:],["And" "Unicode"; "ματριξ" missing]))
end
Expand Down