-
Notifications
You must be signed in to change notification settings - Fork 27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Overhauled to Arrow Back-End and Better Memory Safety #78
Changes from 42 commits
2ec0117
d437526
c33a8f0
f6f2f70
7d616f6
37dde32
a0cf08e
7ee89b0
24b7651
dd3c7a6
c81675a
5a632ac
9470501
3603fbd
5988002
c38aa23
82abe14
de6dc61
1caf39f
cf8094f
23d8ce4
486caf2
cbb2b89
bb582f1
7f75e47
d859ca1
f1cabdc
1ab7749
d92bd16
9d78a51
ddda3f6
ad838e2
472522f
748f286
b5bc7c5
5f317cd
1fb65b7
2b08439
030de0e
666cb8d
8c5dcc0
50c1426
fdc4ef2
eae7b7e
cbccde1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
julia 0.6 | ||
Arrow | ||
FlatBuffers 0.3.0 | ||
CategoricalArrays 0.3.0 | ||
DataFrames 0.11.0 | ||
DataStreams 0.3.0 | ||
WeakRefStrings 0.4.0 |
This file was deleted.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
|
||
"""
    getoutputlength(version::Int32, x::Integer)

Return `x` unchanged when `version` is below `FEATHER_VERSION`; otherwise return
`padding(x)`.
"""
function getoutputlength(version::Int32, x::Integer)
    if version < FEATHER_VERSION
        return x
    end
    return padding(x)
end
|
||
"""
    validatefile(filename::AbstractString, data::AbstractVector{UInt8})

Check that the bytes in `data` look like a feather file.

Throws an `ArgumentError` if `data` is shorter than `MIN_FILE_LENGTH`, or if its first
four or last four bytes differ from `FEATHER_MAGIC_BYTES`. Returns `nothing` otherwise.
`filename` is used only to build the error messages.
"""
function validatefile(filename::AbstractString, data::AbstractVector{UInt8})
    if length(data) < MIN_FILE_LENGTH
        # The message must interpolate `filename`; an undefined name here would raise
        # `UndefVarError` instead of the intended `ArgumentError`.
        throw(ArgumentError(string("'$filename' is not in feather format: ",
                                   "total length of file: $(length(data))")))
    end
    header = data[1:4]
    footer = data[(end-3):end]
    if header ≠ FEATHER_MAGIC_BYTES || footer ≠ FEATHER_MAGIC_BYTES
        throw(ArgumentError(string("'$filename' is not in feather format: header = $header, ",
                                   "footer = $footer.")))
    end
    return nothing
end
|
||
"""
    loadfile(filename::AbstractString; use_mmap::Bool=SHOULD_USE_MMAP)

Load the bytes of `filename`, check them with `validatefile`, and return them.

# Keywords
- `use_mmap`: when `true` the file is memory-mapped with `Mmap.mmap`; when `false` it is
  read with `read`. Defaults to `SHOULD_USE_MMAP`.

# Throws
- `ArgumentError` if `filename` is not an existing file, or if `validatefile` rejects
  the data.
"""
function loadfile(filename::AbstractString; use_mmap::Bool=SHOULD_USE_MMAP)
    isfile(filename) || throw(ArgumentError("'$filename' is not a valid file."))
    # Honor the caller's keyword rather than the global default.
    data = use_mmap ? Mmap.mmap(filename) : read(filename)
    validatefile(filename, data)
    return data
end
|
||
"""
    metalength(data::AbstractVector{UInt8})

Read an `Int32` from the four bytes of `data` located at positions
`length(data)-7` through `length(data)-4`.
"""
function metalength(data::AbstractVector{UInt8})
    n = length(data)
    lenbytes = data[(n - 7):(n - 4)]
    return read(IOBuffer(lenbytes), Int32)
end
|
||
"""
    metaposition(data::AbstractVector{UInt8}, metalen::Integer=metalength(data))

Return `length(data) - (metalen + 7)`. When `metalen` is omitted it is obtained from
`metalength(data)`.
"""
function metaposition(data::AbstractVector{UInt8}, metalen::Integer=metalength(data))
    trailer = metalen + 7
    return length(data) - trailer
end
|
||
"""
    rootposition(data::AbstractVector{UInt8}, mpos::Integer=metaposition(data))

Read the `Int32` stored in the four bytes of `data` starting at index `mpos`. When `mpos`
is omitted it is obtained from `metaposition(data)`.
"""
function rootposition(data::AbstractVector{UInt8}, mpos::Integer=metaposition(data))
    # An `Int32` occupies exactly four bytes, so slice `mpos:(mpos+3)`; slicing one byte
    # further would throw a `BoundsError` when only four bytes remain.
    return read(IOBuffer(data[mpos:(mpos + 3)]), Int32)
end
|
||
"""
    getctable(data::AbstractVector{UInt8})

Read a `Metadata.CTable` from `data` with `FlatBuffers.read`, at the offset
`metaposition(data) + rootposition(data, metaposition(data)) - 1`.

Emits a warning when the table's `version` is below `FEATHER_VERSION`, and returns the
table.
"""
function getctable(data::AbstractVector{UInt8})
    mpos = metaposition(data)
    offset = mpos + rootposition(data, mpos) - 1
    table = FlatBuffers.read(Metadata.CTable, data, offset)
    table.version < FEATHER_VERSION && @warn("This feather file is old and may not be readable.")
    return table
end
|
||
|
||
"""
    Data.schema(ctable::Metadata.CTable)

Build a `Data.Schema` from `ctable`: the column names come from each column's `name`, the
element types from `juliatype(col)`, and the row count from `ctable.num_rows`.
"""
function Data.schema(ctable::Metadata.CTable)
    cols = ctable.columns
    colnames = String[c.name for c in cols]
    coltypes = Type[juliatype(c) for c in cols]
    return Data.Schema(coltypes, colnames, ctable.num_rows)
end
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,13 +2,14 @@ module Metadata | |
|
||
if Base.VERSION < v"0.7.0-DEV.2575" | ||
const Dates = Base.Dates | ||
using Compat | ||
else | ||
import Dates | ||
end | ||
|
||
using FlatBuffers | ||
|
||
@enum(Type_, BOOL = 0, INT8 = 1, INT16 = 2, INT32 = 3, INT64 = 4, | ||
@enum(DType, BOOL = 0, INT8 = 1, INT16 = 2, INT32 = 3, INT64 = 4, | ||
UINT8 = 5, UINT16 = 6, UINT32 = 7, UINT64 = 8, | ||
FLOAT = 9, DOUBLE = 10, UTF8 = 11, BINARY = 12, | ||
CATEGORY = 13, TIMESTAMP = 14, DATE = 15, TIME = 16) | ||
|
@@ -20,14 +21,15 @@ using FlatBuffers | |
# FlatBuffers.enumsizeof(::Type{TimeUnit}) = UInt8 | ||
|
||
mutable struct PrimitiveArray | ||
type_::Type_ | ||
dtype::DType | ||
encoding::Encoding | ||
offset::Int64 | ||
length::Int64 | ||
null_count::Int64 | ||
total_bytes::Int64 | ||
end | ||
|
||
# TODO why are these done this way rather with an abstract type??? | ||
mutable struct CategoryMetadata | ||
levels::PrimitiveArray | ||
ordered::Bool | ||
|
@@ -47,7 +49,7 @@ mutable struct TimeMetadata | |
unit::TimeUnit | ||
end | ||
|
||
@UNION TypeMetadata (Void,CategoryMetadata,TimestampMetadata,DateMetadata,TimeMetadata) | ||
@UNION TypeMetadata (Nothing,CategoryMetadata,TimestampMetadata,DateMetadata,TimeMetadata) | ||
|
||
mutable struct Column | ||
name::String | ||
|
@@ -57,8 +59,10 @@ mutable struct Column | |
user_metadata::String | ||
end | ||
|
||
function Column(name::String, values::PrimitiveArray, metadata::TypeMetadata=nothing, user_metadata::String="") | ||
return Column(name, values, FlatBuffers.typeorder(TypeMetadata, typeof(metadata)), metadata, user_metadata) | ||
function Column(name::String, values::PrimitiveArray, metadata::TypeMetadata=nothing, | ||
user_metadata::String="") | ||
Column(name, values, FlatBuffers.typeorder(TypeMetadata, typeof(metadata)), | ||
metadata, user_metadata) | ||
end | ||
|
||
mutable struct CTable | ||
|
@@ -73,8 +77,8 @@ end # module | |
|
||
# wesm/feather/cpp/src/metadata_generated.h | ||
# wesm/feather/cpp/src/types.h | ||
const Type_2julia = Dict{Metadata.Type_,DataType}( | ||
Metadata.BOOL => Arrow.Bool, | ||
const JULIA_TYPE_DICT = Dict{Metadata.DType,DataType}( | ||
Metadata.BOOL => Bool, | ||
Metadata.INT8 => Int8, | ||
Metadata.INT16 => Int16, | ||
Metadata.INT32 => Int32, | ||
|
@@ -85,15 +89,15 @@ const Type_2julia = Dict{Metadata.Type_,DataType}( | |
Metadata.UINT64 => UInt64, | ||
Metadata.FLOAT => Float32, | ||
Metadata.DOUBLE => Float64, | ||
Metadata.UTF8 => WeakRefString{UInt8}, | ||
Metadata.UTF8 => String, # can also be WeakRefString{UInt8} | ||
Metadata.BINARY => Vector{UInt8}, | ||
Metadata.CATEGORY => Int64, | ||
Metadata.TIMESTAMP => Int64, | ||
Metadata.DATE => Int64, | ||
Metadata.TIME => Int64 | ||
) | ||
|
||
const julia2Type_ = Dict{DataType,Metadata.Type_}( | ||
const MDATA_TYPE_DICT = Dict{DataType,Metadata.DType}( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should just spell out. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Will change. |
||
Bool => Metadata.BOOL, | ||
Int8 => Metadata.INT8, | ||
Int16 => Metadata.INT16, | ||
|
@@ -106,18 +110,63 @@ const julia2Type_ = Dict{DataType,Metadata.Type_}( | |
Float32 => Metadata.FLOAT, | ||
Float64 => Metadata.DOUBLE, | ||
String => Metadata.UTF8, | ||
Vector{UInt8} => Metadata.BINARY, | ||
Dates.DateTime => Metadata.INT64, | ||
Dates.Date => Metadata.INT32, | ||
WeakRefString{UInt8} => Metadata.UTF8 | ||
Vector{UInt8} => Metadata.BINARY, | ||
Dates.Time => Metadata.INT64, | ||
Dates.DateTime => Metadata.INT64, | ||
Dates.Date => Metadata.INT32, | ||
# WeakRefString{UInt8} => Metadata.UTF8 # not currently being used | ||
) | ||
|
||
const NON_PRIMITIVE_TYPES = Set([Metadata.UTF8, Metadata.BINARY]) | ||
|
||
const TimeUnit2julia = Dict{Metadata.TimeUnit,DataType}( | ||
Metadata.SECOND => Arrow.Second, | ||
Metadata.MILLISECOND => Arrow.Millisecond, | ||
Metadata.MICROSECOND => Arrow.Microsecond, | ||
Metadata.NANOSECOND => Arrow.Nanosecond | ||
const JULIA_TIME_DICT = Dict{Metadata.TimeUnit,DataType}( | ||
Metadata.SECOND => Dates.Second, | ||
Metadata.MILLISECOND => Dates.Millisecond, | ||
Metadata.MICROSECOND => Dates.Microsecond, | ||
Metadata.NANOSECOND => Dates.Nanosecond | ||
) | ||
const julia2TimeUnit = Dict{DataType,Metadata.TimeUnit}([(v, k) for (k,v) in TimeUnit2julia]) | ||
const MDATA_TIME_DICT = Dict{DataType,Metadata.TimeUnit}(v=>k for (k,v) in JULIA_TIME_DICT) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Spell out in full here too. |
||
|
||
|
||
"""
    isprimitivetype(t::Metadata.DType)

Return `true` unless `t` is a member of `NON_PRIMITIVE_TYPES`.
"""
function isprimitivetype(t::Metadata.DType)
    return !(t in NON_PRIMITIVE_TYPES)
end
|
||
|
||
# juliatype(meta, values_type): map a column's type metadata plus the metadata type of
# its values to a Julia type. One method per kind of metadata.

# No metadata: look the values type up directly.
function juliatype(meta::Nothing, values_type::Metadata.DType)
    return JULIA_TYPE_DICT[values_type]
end

# Convenience form that behaves as if no metadata were present.
function juliatype(values_type::Metadata.DType)
    return juliatype(nothing, values_type)
end

# Category metadata: the result is determined by the dtype of the levels array.
function juliatype(meta::Metadata.CategoryMetadata, values_type::Metadata.DType)
    leveltype = meta.levels.dtype
    return JULIA_TYPE_DICT[leveltype]
end

# Timestamp metadata: parametrized by the Julia period type for `meta.unit`.
function juliatype(meta::Metadata.TimestampMetadata, values_type::Metadata.DType)
    unit = JULIA_TIME_DICT[meta.unit]
    return Timestamp{unit}
end

# Time metadata: parametrized by the period type for `meta.unit` and the values type.
function juliatype(meta::Metadata.TimeMetadata, values_type::Metadata.DType)
    unit = JULIA_TIME_DICT[meta.unit]
    storage = JULIA_TYPE_DICT[values_type]
    return TimeOfDay{unit,storage}
end

# Date metadata: always `Datestamp`.
function juliatype(meta::Metadata.DateMetadata, values_type::Metadata.DType)
    return Datestamp
end
|
||
"""
    juliatype(col::Metadata.Column)

Return the Julia element type for `col`, computed from `col.metadata` and
`col.values.dtype`. The type is wrapped as `Union{T,Missing}` unless
`col.values.null_count` is zero.
"""
function juliatype(col::Metadata.Column)
    elemtype = juliatype(col.metadata, col.values.dtype)
    if col.values.null_count == 0
        return elemtype
    end
    return Union{elemtype,Missing}
end
|
||
# feathertype(T): map a Julia element type to its `Metadata` type.

# Generic case: look the type up in `MDATA_TYPE_DICT`.
function feathertype(::Type{T}) where T
    return MDATA_TYPE_DICT[T]
end

# `Union{T,Missing}` maps to the same metadata type as `T`.
function feathertype(::Type{Union{T,Missing}}) where T
    return feathertype(T)
end

function feathertype(::Type{<:Arrow.Datestamp})
    return Metadata.INT32
end

function feathertype(::Type{<:Arrow.Timestamp})
    return Metadata.INT64
end

# For `TimeOfDay` the metadata type follows the second type parameter.
function feathertype(::Type{<:Arrow.TimeOfDay{P,Int32}}) where P
    return Metadata.INT32
end

function feathertype(::Type{<:Arrow.TimeOfDay{P,Int64}}) where P
    return Metadata.INT64
end
|
||
# getmetadata(io, T, A): build the type metadata object for a column with element type
# `T` backed by `A`.

# Fallback: no metadata.
function getmetadata(io::IO, ::Type{T}, A::ArrowVector) where T
    return nothing
end

# `Union{T,Missing}` uses the same metadata as `T`.
function getmetadata(io::IO, ::Type{Union{T,Missing}}, A::ArrowVector) where T
    return getmetadata(io, T, A)
end

function getmetadata(io::IO, ::Type{Arrow.Datestamp}, A::ArrowVector)
    return Metadata.DateMetadata()
end

# The time unit is looked up from the period parameter; the second field is left as "".
function getmetadata(io::IO, ::Type{Arrow.Timestamp{T}}, A::ArrowVector) where T
    unit = MDATA_TIME_DICT[T]
    return Metadata.TimestampMetadata(unit, "")
end

function getmetadata(io::IO, ::Type{Arrow.TimeOfDay{P,T}}, A::ArrowVector) where {P,T}
    unit = MDATA_TIME_DICT[P]
    return Metadata.TimeMetadata(unit)
end

# TODO Arrow standard says nothing about specifying whether DictEncoding is ordered!
# Dictionary-encoded data: the levels are written through `writecontents` and the
# category metadata is always created with `ordered = true`.
function getmetadata(io::IO, ::Type{T}, A::DictEncoding) where T
    levelvalues = writecontents(Metadata.PrimitiveArray, io, levels(A))
    return Metadata.CategoryMetadata(levelvalues, true)
end
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you expound this comment?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't even remember. I think I wrote that comment very early on, probably not realizing that a
@UNION
was coming from FlatBuffers. Will delete.