Skip to content

Commit

Permalink
Overhauled to Arrow Back-End and Better Memory Safety (#78)
Browse files Browse the repository at this point in the history
* Fixed #70.

* Initial cleanup.

* Split into multiple files.

* Moved Arrow.jl to its own directory.

* Fixed method ambiguity in getmetadata.

* Initial implementation with arrow backend.

* Fixed errors; now works for basic bits types and strings.

* Now correctly implement datetime.

* Rewrote column constructors to be reasonable and sane.

* Dict encoding now working.

* Started writing sink.

* Reads now work with new version of Arrow.

* Continuing to work on sinks.

* Everything works except dictionary encoding, which is currently completely fucked on write side.

* Most column types now supported.

* Most functionality now properly implemented.

* Finally supports bools!

* Tried to fix DictEncoding but it's still fucked up.

* Finally completely fixed DictEncoding.

* Fixed unit testing.

* Removed old reference files.

* Added some materialize methods.

* Removed old comment about DictEncoding being fucked up.

* Removed old reference file fileio.jl

* Removed spurious comment.

* Removed spurious comment.

* Tried to fix appveyor yaml.

* Tried to fix appveyor yaml.

* DictEncoding now works for non Int32, cleaned up some things.

* Added a materialize method for a DataFrame.

* Updated for 0.7.

* Fixed scary metadata bug.

* Replaced uninitialized with undef.

* Fixed potential file validation bug.

* Updated for new Arrow locator interface.

* Cleaned up Source functions a bit.

* Started adding extra tests.

* Removed references to pre 0.6 in README.

* Added more unit tests.

* Removed explicit Arrow clone commands from travis and appveyor (now that it's registered).

* Small fixes.

* Fixed breaking test due to poor type inference on 0.6.
  • Loading branch information
ExpandingMan authored and quinnj committed Apr 10, 2018
1 parent 03a19c4 commit a2558d1
Show file tree
Hide file tree
Showing 101 changed files with 585 additions and 640 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

| **Documentation** | **PackageEvaluator** | **Build Status** |
|:-------------------------------------------------------------------------------:|:---------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------:|
| [![][docs-stable-img]][docs-stable-url] [![][docs-latest-img]][docs-latest-url] | [![][pkg-0.4-img]][pkg-0.4-url] [![][pkg-0.5-img]][pkg-0.5-url] [![][pkg-0.6-img]][pkg-0.6-url] | [![][travis-img]][travis-url] [![][appveyor-img]][appveyor-url] [![][codecov-img]][codecov-url] |
| [![][docs-stable-img]][docs-stable-url] [![][docs-latest-img]][docs-latest-url] | [![][pkg-0.6-img]][pkg-0.6-url] | [![][travis-img]][travis-url] [![][appveyor-img]][appveyor-url] [![][codecov-img]][codecov-url] |


## Installation
Expand All @@ -23,7 +23,7 @@ julia> Pkg.add("Feather")

## Project Status

The package is tested against Julia `0.4` and *current* `0.5` on Linux, OS X, and Windows.
The package is tested against Julia `0.6` and `0.7` on Linux, OS X, and Windows.

## Contributing and Questions

Expand Down
3 changes: 2 additions & 1 deletion REQUIRE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
julia 0.6
Arrow
FlatBuffers 0.3.0
CategoricalArrays 0.3.0
DataFrames 0.11.0
DataStreams 0.3.0
WeakRefStrings 0.4.0
Compat 0.63.0
59 changes: 0 additions & 59 deletions src/Arrow.jl

This file was deleted.

534 changes: 18 additions & 516 deletions src/Feather.jl

Large diffs are not rendered by default.

55 changes: 55 additions & 0 deletions src/loadfile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@

"""
    getoutputlength(version::Int32, x::Integer)

Return `x` unchanged when `version` is older than `FEATHER_VERSION`; otherwise return
`padding(x)`.
"""
function getoutputlength(version::Int32, x::Integer)
    if version < FEATHER_VERSION
        return x
    end
    return padding(x)
end

"""
    validatefile(filename::AbstractString, data::AbstractVector{UInt8})

Check that `data` (the bytes loaded from `filename`) has the outer structure of a feather
file.

`data` must contain at least `MIN_FILE_LENGTH` bytes, and both its first four bytes and its
last four bytes must equal `FEATHER_MAGIC_BYTES`. `filename` is used only to build the error
messages. Returns `nothing` when every check passes.

# Throws
- `ArgumentError` if `data` is too short or if either magic-byte marker does not match.
"""
function validatefile(filename::AbstractString, data::AbstractVector{UInt8})
    if length(data) < MIN_FILE_LENGTH
        # The message must interpolate `filename`; no variable named `file` exists here.
        throw(ArgumentError("'$filename' is not in feather format: total length of file: $(length(data))"))
    end
    header = data[1:4]
    footer = data[(end-3):end]
    if header != FEATHER_MAGIC_BYTES || footer != FEATHER_MAGIC_BYTES
        throw(ArgumentError(string("'$filename' is not in feather format: header = $header, ",
                                   "footer = $footer.")))
    end
    return nothing
end

"""
    loadfile(filename::AbstractString; use_mmap::Bool=SHOULD_USE_MMAP)

Load the bytes of `filename`, validate them with `validatefile`, and return them.

# Keywords
- `use_mmap`: when `true` the file is memory-mapped with `Mmap.mmap`; when `false` it is
  read fully into memory with `read`.

# Throws
- `ArgumentError` if `filename` is not an existing file, or (via `validatefile`) if its
  contents fail validation.
"""
function loadfile(filename::AbstractString; use_mmap::Bool=SHOULD_USE_MMAP)
    isfile(filename) || throw(ArgumentError("'$filename' is not a valid file."))
    # Honour the caller's keyword rather than the global default.
    data = use_mmap ? Mmap.mmap(filename) : read(filename)
    validatefile(filename, data)
    return data
end

"""
    metalength(data::AbstractVector{UInt8})

Read an `Int32` from the four bytes of `data` that sit immediately before its last four
bytes (positions `length(data)-7` through `length(data)-4`).
"""
function metalength(data::AbstractVector{UInt8})
    n = length(data)
    io = IOBuffer(data[(n - 7):(n - 4)])
    return read(io, Int32)
end

"""
    metaposition(data::AbstractVector{UInt8}, metalen::Integer=metalength(data))

Return `length(data) - (metalen + 7)`. When `metalen` is omitted it is obtained from
`metalength(data)`.
"""
function metaposition(data::AbstractVector{UInt8}, metalen::Integer=metalength(data))
    # NOTE(review): presumably the 1-based start index of the metadata section; confirm.
    trailer = metalen + 7
    return length(data) - trailer
end

"""
    rootposition(data::AbstractVector{UInt8}, mpos::Integer=metaposition(data))

Read the `Int32` stored in the four bytes of `data` starting at index `mpos`. When `mpos`
is omitted it is obtained from `metaposition(data)`.
"""
function rootposition(data::AbstractVector{UInt8}, mpos::Integer=metaposition(data))
    # An Int32 occupies exactly four bytes, so slice `mpos:(mpos+3)`; taking a fifth byte
    # would raise a spurious BoundsError when only four bytes remain.
    return read(IOBuffer(data[mpos:(mpos + 3)]), Int32)
end

"""
    getctable(data::AbstractVector{UInt8})

Read a `Metadata.CTable` from `data` with `FlatBuffers.read`, at the offset
`metaposition(data) + rootposition(data, metaposition(data)) - 1`.

Logs a warning when the table's `version` is lower than `FEATHER_VERSION`, and returns the
table in either case.
"""
function getctable(data::AbstractVector{UInt8})
    mpos = metaposition(data)
    offset = mpos + rootposition(data, mpos) - 1
    table = FlatBuffers.read(Metadata.CTable, data, offset)
    table.version < FEATHER_VERSION &&
        @warn("This feather file is old and may not be readable.")
    return table
end


"""
    Data.schema(ctable::Metadata.CTable)

Build a `Data.Schema` from `ctable`: one column name (`col.name`) and one element type
(`juliatype(col)`) per entry of `ctable.columns`, plus `ctable.num_rows` as the row count.
"""
function Data.schema(ctable::Metadata.CTable)
    columns = ctable.columns
    # Typed comprehensions give the same Vector{String} / Vector{Type} containers as
    # preallocating with `undef` and filling by index.
    header = String[col.name for col in columns]
    types = Type[juliatype(col) for col in columns]
    return Data.Schema(types, header, ctable.num_rows)
end
86 changes: 67 additions & 19 deletions src/metadata.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@ module Metadata

if Base.VERSION < v"0.7.0-DEV.2575"
const Dates = Base.Dates
using Compat
else
import Dates
end

using FlatBuffers

@enum(Type_, BOOL = 0, INT8 = 1, INT16 = 2, INT32 = 3, INT64 = 4,
@enum(DType, BOOL = 0, INT8 = 1, INT16 = 2, INT32 = 3, INT64 = 4,
UINT8 = 5, UINT16 = 6, UINT32 = 7, UINT64 = 8,
FLOAT = 9, DOUBLE = 10, UTF8 = 11, BINARY = 12,
CATEGORY = 13, TIMESTAMP = 14, DATE = 15, TIME = 16)
Expand All @@ -20,7 +21,7 @@ using FlatBuffers
# FlatBuffers.enumsizeof(::Type{TimeUnit}) = UInt8

mutable struct PrimitiveArray
type_::Type_
dtype::DType
encoding::Encoding
offset::Int64
length::Int64
Expand All @@ -47,7 +48,7 @@ mutable struct TimeMetadata
unit::TimeUnit
end

@UNION TypeMetadata (Void,CategoryMetadata,TimestampMetadata,DateMetadata,TimeMetadata)
@UNION TypeMetadata (Nothing,CategoryMetadata,TimestampMetadata,DateMetadata,TimeMetadata)

mutable struct Column
name::String
Expand All @@ -57,8 +58,10 @@ mutable struct Column
user_metadata::String
end

function Column(name::String, values::PrimitiveArray, metadata::TypeMetadata=nothing, user_metadata::String="")
return Column(name, values, FlatBuffers.typeorder(TypeMetadata, typeof(metadata)), metadata, user_metadata)
function Column(name::String, values::PrimitiveArray, metadata::TypeMetadata=nothing,
user_metadata::String="")
Column(name, values, FlatBuffers.typeorder(TypeMetadata, typeof(metadata)),
metadata, user_metadata)
end

mutable struct CTable
Expand All @@ -73,8 +76,8 @@ end # module

# wesm/feather/cpp/src/metadata_generated.h
# wesm/feather/cpp/src/types.h
const Type_2julia = Dict{Metadata.Type_,DataType}(
Metadata.BOOL => Arrow.Bool,
const JULIA_TYPE_DICT = Dict{Metadata.DType,DataType}(
Metadata.BOOL => Bool,
Metadata.INT8 => Int8,
Metadata.INT16 => Int16,
Metadata.INT32 => Int32,
Expand All @@ -85,15 +88,15 @@ const Type_2julia = Dict{Metadata.Type_,DataType}(
Metadata.UINT64 => UInt64,
Metadata.FLOAT => Float32,
Metadata.DOUBLE => Float64,
Metadata.UTF8 => WeakRefString{UInt8},
Metadata.UTF8 => String, # can also be WeakRefString{UInt8}
Metadata.BINARY => Vector{UInt8},
Metadata.CATEGORY => Int64,
Metadata.TIMESTAMP => Int64,
Metadata.DATE => Int64,
Metadata.TIME => Int64
)

const julia2Type_ = Dict{DataType,Metadata.Type_}(
const METADATA_TYPE_DICT = Dict{DataType,Metadata.DType}(
Bool => Metadata.BOOL,
Int8 => Metadata.INT8,
Int16 => Metadata.INT16,
Expand All @@ -106,18 +109,63 @@ const julia2Type_ = Dict{DataType,Metadata.Type_}(
Float32 => Metadata.FLOAT,
Float64 => Metadata.DOUBLE,
String => Metadata.UTF8,
Vector{UInt8} => Metadata.BINARY,
Dates.DateTime => Metadata.INT64,
Dates.Date => Metadata.INT32,
WeakRefString{UInt8} => Metadata.UTF8
Vector{UInt8} => Metadata.BINARY,
Dates.Time => Metadata.INT64,
Dates.DateTime => Metadata.INT64,
Dates.Date => Metadata.INT32,
# WeakRefString{UInt8} => Metadata.UTF8 # not currently being used
)

const NON_PRIMITIVE_TYPES = Set([Metadata.UTF8, Metadata.BINARY])

const TimeUnit2julia = Dict{Metadata.TimeUnit,DataType}(
Metadata.SECOND => Arrow.Second,
Metadata.MILLISECOND => Arrow.Millisecond,
Metadata.MICROSECOND => Arrow.Microsecond,
Metadata.NANOSECOND => Arrow.Nanosecond
const JULIA_TIME_DICT = Dict{Metadata.TimeUnit,DataType}(
Metadata.SECOND => Dates.Second,
Metadata.MILLISECOND => Dates.Millisecond,
Metadata.MICROSECOND => Dates.Microsecond,
Metadata.NANOSECOND => Dates.Nanosecond
)
const julia2TimeUnit = Dict{DataType,Metadata.TimeUnit}([(v, k) for (k,v) in TimeUnit2julia])
const METADATA_TIME_DICT = Dict{DataType,Metadata.TimeUnit}(v=>k for (k,v) in JULIA_TIME_DICT)


# A feather dtype counts as primitive exactly when it is not listed in
# `NON_PRIMITIVE_TYPES`.
isprimitivetype(t::Metadata.DType) = !(t in NON_PRIMITIVE_TYPES)


# Map a column's type metadata plus the dtype of its stored values to a Julia type.
# The method is chosen by the kind of metadata attached to the column.

# No metadata: look the stored dtype up directly.
function juliatype(meta::Nothing, values_type::Metadata.DType)
    return JULIA_TYPE_DICT[values_type]
end

# Convenience form for a bare dtype without metadata.
function juliatype(values_type::Metadata.DType)
    return juliatype(nothing, values_type)
end

# Category columns: the type comes from the dtype of the levels array, not from
# `values_type`.
juliatype(meta::Metadata.CategoryMetadata, values_type::Metadata.DType) =
    JULIA_TYPE_DICT[meta.levels.dtype]

# Timestamps are parameterised by the Julia period type for the metadata's unit.
juliatype(meta::Metadata.TimestampMetadata, values_type::Metadata.DType) =
    Timestamp{JULIA_TIME_DICT[meta.unit]}

# Times of day carry both the unit's period type and the stored value type.
juliatype(meta::Metadata.TimeMetadata, values_type::Metadata.DType) =
    TimeOfDay{JULIA_TIME_DICT[meta.unit],JULIA_TYPE_DICT[values_type]}

# Dates always map to `Datestamp`.
function juliatype(meta::Metadata.DateMetadata, values_type::Metadata.DType)
    return Datestamp
end

# Element type for a whole column: the metadata-derived type, widened with `Missing`
# whenever the column reports a non-zero null count.
function juliatype(col::Metadata.Column)
    base = juliatype(col.metadata, col.values.dtype)
    if col.values.null_count == 0
        return base
    end
    return Union{base,Missing}
end

# Map a Julia element type to the feather dtype used to store it.

# Generic case: look the type up in `METADATA_TYPE_DICT`.
function feathertype(::Type{T}) where {T}
    return METADATA_TYPE_DICT[T]
end

# `Missing` does not affect the stored dtype; defer to the non-missing type.
function feathertype(::Type{Union{T,Missing}}) where {T}
    return feathertype(T)
end

# Arrow date/time wrappers map to fixed integer dtypes.
function feathertype(::Type{<:Arrow.Datestamp})
    return Metadata.INT32
end
function feathertype(::Type{<:Arrow.Timestamp})
    return Metadata.INT64
end
function feathertype(::Type{<:Arrow.TimeOfDay{P,Int32}}) where {P}
    return Metadata.INT32
end
function feathertype(::Type{<:Arrow.TimeOfDay{P,Int64}}) where {P}
    return Metadata.INT64
end

# Build the type-specific metadata object for a column of element type `T` backed by `A`.

# Default: no extra metadata.
function getmetadata(io::IO, ::Type{T}, A::ArrowVector) where {T}
    return nothing
end

# `Missing` is stripped and the lookup repeated on the underlying type.
function getmetadata(io::IO, ::Type{Union{T,Missing}}, A::ArrowVector) where {T}
    return getmetadata(io, T, A)
end

function getmetadata(io::IO, ::Type{Arrow.Datestamp}, A::ArrowVector)
    return Metadata.DateMetadata()
end

# NOTE(review): the second argument "" is presumably the timezone field; confirm against
# the `TimestampMetadata` definition.
getmetadata(io::IO, ::Type{Arrow.Timestamp{T}}, A::ArrowVector) where {T} =
    Metadata.TimestampMetadata(METADATA_TIME_DICT[T], "")

# Only the period parameter `P` determines the time unit; `T` is not used here.
getmetadata(io::IO, ::Type{Arrow.TimeOfDay{P,T}}, A::ArrowVector) where {P,T} =
    Metadata.TimeMetadata(METADATA_TIME_DICT[P])
# WARNING Arrow standard says nothing about specifying whether DictEncoding is ordered!
# Category metadata for a dictionary-encoded column: the levels of `A` are written to `io`
# via `writecontents`, and the resulting `PrimitiveArray` descriptor is stored in the
# metadata.
function getmetadata(io::IO, ::Type{T}, A::DictEncoding) where {T}
    level_array = writecontents(Metadata.PrimitiveArray, io, levels(A))
    # NOTE(review): the hard-coded `true` is presumably the `ordered` flag; confirm against
    # the `CategoryMetadata` definition.
    return Metadata.CategoryMetadata(level_array, true)
end
Loading

0 comments on commit a2558d1

Please sign in to comment.