Skip to content

Commit

Permalink
Performance improvements, do not materialize StringVectors
Browse files Browse the repository at this point in the history
  • Loading branch information
jaakkor2 committed Jan 27, 2024
1 parent c9168b8 commit c076735
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 20 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "JMPReader"
uuid = "d9f7e686-cf87-4d12-8d7a-0e9b8c9fba29"
authors = ["Jaakko Ruohio <[email protected]>"]
version = "0.1.6-DEV"
version = "0.1.6"

[deps]
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Expand Down
10 changes: 8 additions & 2 deletions src/JMPReader.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@ module JMPReader
export readjmp

using Dates: unix2datetime, Date, DateTime
using DataFrames: DataFrame
using DataFrames: DataFrame, select!
using CodecZlib: transcode, GzipDecompressor
using LibDeflate: gzip_decompress!, Decompressor, LibDeflateErrors, LibDeflateErrors.deflate_insufficient_space
using WeakRefStrings: StringVector
using Base.Threads: nthreads, @threads, @spawn
using Base.Iterators: partition

include("types.jl")
include("constants.jl")
Expand All @@ -24,9 +26,13 @@ function readjmp(fn::AbstractString)
a = read(fn)
check_magic(a, fn)
info = metadata(a)

deflatebuffer = Vector{UInt8}()
alldata = [column_data(a, info, i, deflatebuffer) for i in 1:info.ncols]
return DataFrame(alldata, info.column.names)
names = info.column.names
df = DataFrame(alldata, names)

return df
end

end # module JMPReader
21 changes: 6 additions & 15 deletions src/column.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,7 @@ function column_data(data, info, i::Int, deflatebuffer::Vector{UInt8})
(0x01 ≤ dt3 ≤ 0x07 && dt4 == 0x00)
width = dt5
io = a[end-info.nrows*width+1:end]
str = StringVector{String}(io, info.nrows)
str.lengths .= width
str.offsets .= [0; cumsum(str.lengths)[begin:end-1]]
str = rstrip.(str, '\0')
str = String.(str) # SubString->String
str = to_str(io, info.nrows, width)
return str
end

Expand All @@ -120,14 +116,12 @@ function column_data(data, info, i::Int, deflatebuffer::Vector{UInt8})
else # uncompressed
# continue after dt1,...,dt5 were read
_read_reals!(raw, offset, UInt8, 5)
hasunits = _read_real!(raw, offset, UInt8)
hasprops = _read_real!(raw, offset, UInt8)
_read_reals!(raw, offset, UInt8)
n1 = _read_real!(raw, offset, Int64)
if hasunits == 1 && n1 > 0
_read_real!(raw, offset, Int16) # ??
_read_real!(raw, offset, Int64) # some length
label = _read_string!(raw, offset, 4)
_read_real!(raw, offset, UInt32)
if hasprops == 1
# some block that ends in [0xff, 0xff, 0xff, 0xff]
offset[1] = findnext([0xff, 0xff, 0xff, 0xff], raw, offset[1])[end]
end
_read_real!(raw, offset, UInt16) # n2 as bytes
n2 = _read_real!(raw, offset, UInt32)
Expand All @@ -146,10 +140,7 @@ function column_data(data, info, i::Int, deflatebuffer::Vector{UInt8})
end
io = raw[end-sum(widths)+1:end]
end
str = StringVector{String}(io, info.nrows)
str.lengths .= widths
str.offsets .= [0; cumsum(UInt64.(widths))[begin:end-1]]
str = String.(str) # materialize
str = to_str(io, info.nrows, widths)
return str
end
end
Expand Down
42 changes: 40 additions & 2 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,45 @@ end

function check_magic(a, fn)
len = length(a)
len ≥ length(MAGIC_JMP) && a[1:length(MAGIC_JMP)] == MAGIC_JMP || throw(ArgumentError("\"$fn\" is not a .jmp file"))
len < 507 && throw(ArgumentError("\"$fn\" truncated?"))
len ≥ length(MAGIC_JMP) && a[1:length(MAGIC_JMP)] == MAGIC_JMP || throw(ArgumentError("Data table appears to have been corrupted, or is not a .jmp file. `$fn` "))
len < 507 && throw(ArgumentError("Data table appears to have been corrupted. `$fn`"))
nothing
end

function to_str(buffer, n, lengths::AbstractVector)
    # Build an `n`-element StringVector on top of `buffer`, one length per element.
    strings = StringVector{String}(buffer, n)
    strings.lengths .= lengths

    # Each element starts where the previous one ended: the offset of element
    # `idx` is the running sum of the lengths that precede it.
    position = UInt64(0)
    idx = 1
    @inbounds while idx <= n
        strings.offsets[idx] = position
        position += lengths[idx]
        idx += 1
    end
    return strings
end

function to_str(buffer, n, length::Integer)
    # Build an `n`-element StringVector on top of `buffer` where every element
    # initially spans the same fixed width of `length` bytes.
    strings = StringVector{String}(buffer, n)
    fill!(strings.lengths, length)

    # Fixed-width layout: consecutive elements start `length` bytes apart.
    cursor = UInt64(0)
    @inbounds for slot in 1:n
        strings.offsets[slot] = cursor
        cursor += length
    end

    # Shrink each element so trailing 0x00 bytes are not part of the string.
    rstripnull!(strings)
    return strings
end

"""
rstripnull!(strs::StringVector)
Remove trailing nulls from `strs`.
"""
function rstripnull!(s::StringVector)
    # Trims trailing 0x00 bytes from every element in place by shrinking the
    # entries of `s.lengths`; `s.buffer` and `s.offsets` are left untouched.
    # Elements that are empty or consist only of nulls end up with length 0.
    # Returns `nothing`.
    buffer = s.buffer
    # NOTE(review): `@inbounds` assumes every `offset + length` lies inside
    # `buffer` (as set up by `to_str`) — confirm for other callers.
    @inbounds for i in eachindex(s.lengths, s.offsets)
        len = s.lengths[i]
        offset = s.offsets[i]
        # `len > 0` must be tested BEFORE touching the buffer: for an empty or
        # all-null element at offset 0 the index `offset + len` would be 0,
        # which is out of bounds (and unchecked under `@inbounds`).
        while len > 0 && buffer[offset + len] == 0x00
            # `one(len)` keeps the counter in the element type of `s.lengths`
            # instead of promoting it to a wider integer.
            len -= one(len)
        end
        s.lengths[i] = len
    end
    nothing
end

0 comments on commit c076735

Please sign in to comment.