Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions julia/.codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
comment: false
6 changes: 6 additions & 0 deletions julia/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Manifest.toml
*.jl.cov
*.jl.*.cov
*.jl.mem

test/_scrap.jl
41 changes: 41 additions & 0 deletions julia/.travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Documentation: http://docs.travis-ci.com/user/languages/julia/
language: julia
os:
- linux
- osx
- windows
arch:
- x64
# - x86
julia:
- 1.3
- 1
- nightly
branches:
only:
- master
- gh-pages # For building documentation
- /^testing-.*$/ # testing branches
- /^v[0-9]+\.[0-9]+\.[0-9]+$/ # version tags
matrix:
allow_failures:
- julia: nightly
fast_finish: true
exclude:
- os: osx
arch: x86
- os: linux
arch: x86
julia: 1.3
# include:
# - stage: "Documentation"
# julia: 1
# os: linux
# script:
# - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate(); Pkg.build("Arrow")'
# - julia --project=docs/ docs/make.jl
# after_success: skip
notifications:
email: false
after_success:
- julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
21 changes: 21 additions & 0 deletions julia/LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 JuliaData contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
33 changes: 33 additions & 0 deletions julia/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
name = "Arrow"
uuid = "69666777-d1a9-59fb-9406-91d4454c9d45"
authors = ["quinnj <[email protected]>", "ExpandingMan <[email protected]>"]
version = "0.3.0"

[deps]
CodecLz4 = "5ba52731-8f18-5e0d-9241-30f10d1ec561"
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
CodecLz4 = "0.4"
CodecZstd = "0.7"
julia = "1.3"
DataAPI = "1"
PooledArrays = "0.5"
Tables = "1.1"
SentinelArrays = "1"

[extras]
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"


[targets]
test = ["Test", "Random", "JSON3", "StructTypes"]
135 changes: 135 additions & 0 deletions julia/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Arrow

[![Build Status](https://travis-ci.com/JuliaData/Arrow.jl.svg?branch=master)](https://travis-ci.com/JuliaData/Arrow.jl)
[![codecov](https://codecov.io/gh/JuliaData/Arrow.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaData/Arrow.jl)

This is a pure Julia implementation of the [Apache Arrow](https://arrow.apache.org) data standard. This package provides Julia `AbstractVector` objects for
referencing data that conforms to the Arrow standard. This allows users to seamlessly interface Arrow formatted data with a great deal of existing Julia code.

Please see this [document](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout) for a description of the Arrow memory layout.

## Format Support

This implementation supports the 1.0 version of the specification, including support for:
* All primitive data types
* All nested data types
* Dictionary encodings and messages
* Extension types
* Streaming, file, record batch, and replacement and isdelta dictionary messages

It currently doesn't include support for:
* Tensors or sparse tensors
* Flight RPC
* C data interface

Third-party data formats:
* csv and parquet support via the existing CSV.jl and Parquet.jl packages
* Other Tables.jl-compatible packages automatically supported (DataFrames.jl, JSONTables.jl, JuliaDB.jl, SQLite.jl, MySQL.jl, JDBC.jl, ODBC.jl, XLSX.jl, etc.)
* No current Julia packages support ORC or Avro data formats

## Basic usage:

### Installation

```julia
] add Arrow
```

### Reading

#### `Arrow.Table`

Arrow.Table(io::IO; convert::Bool=true)
Arrow.Table(file::String; convert::Bool=true)
Arrow.Table(bytes::Vector{UInt8}, pos=1, len=nothing; convert::Bool=true)

Read an arrow formatted table, from:
* `io`, bytes will be read all at once via `read(io)`
* `file`, bytes will be read via `Mmap.mmap(file)`
* `bytes`, a byte vector directly, optionally specifying the starting byte position `pos` and length `len`

Returns an `Arrow.Table` object that allows column access via `table.col1`, `table[:col1]`, or `table[1]`.

`Arrow.Table` also satisfies the Tables.jl interface, and so can easily be materialized via any supporting
sink function: e.g. `DataFrame(Arrow.Table(file))`, `SQLite.load!(db, "table", Arrow.Table(file))`, etc.

Supports the `convert` keyword argument which controls whether certain arrow primitive types will be
lazily converted to more friendly Julia defaults; by default, `convert=true`.

##### Examples

```julia
using Arrow

# read arrow table from file format
tbl = Arrow.Table(file)

# read arrow table from IO
tbl = Arrow.Table(io)

# read arrow table directly from bytes, like from an HTTP request
resp = HTTP.get(url)
tbl = Arrow.Table(resp.body)
```

#### `Arrow.Stream`

Arrow.Stream(io::IO; convert::Bool=true)
Arrow.Stream(file::String; convert::Bool=true)
Arrow.Stream(bytes::Vector{UInt8}, pos=1, len=nothing; convert::Bool=true)

Start reading an arrow formatted table, from:
* `io`, bytes will be read all at once via `read(io)`
* `file`, bytes will be read via `Mmap.mmap(file)`
* `bytes`, a byte vector directly, optionally specifying the starting byte position `pos` and length `len`

Reads the initial schema message from the arrow stream/file, then returns an `Arrow.Stream` object
which will iterate over record batch messages, producing an `Arrow.Table` on each iteration.

By iterating `Arrow.Table`, `Arrow.Stream` satisfies the `Tables.partitions` interface, and as such can
be passed to Tables.jl-compatible sink functions.

This allows iterating over extremely large "arrow tables" in chunks represented as record batches.

Supports the `convert` keyword argument which controls whether certain arrow primitive types will be
lazily converted to more friendly Julia defaults; by default, `convert=true`.

### Writing

#### `Arrow.write`

Arrow.write(io::IO, tbl)
Arrow.write(file::String, tbl)

Write any Tables.jl-compatible `tbl` out as arrow formatted data.
Providing an `io::IO` argument will cause the data to be written to it
in the "streaming" format, unless `file=true` keyword argument is passed.
Providing a `file::String` argument will result in the "file" format being written.

Multiple record batches will be written based on the number of
`Tables.partitions(tbl)` that are provided; by default, this is just
one for a given table, but some table sources support automatic
partitioning. Note you can turn multiple table objects into partitions
by doing `Tables.partitioner([tbl1, tbl2, ...])`, but note that
each table must have the exact same `Tables.Schema`.

By default, `Arrow.write` will use multiple threads to write multiple
record batches simultaneously (e.g. if julia is started with `julia -t 8`).

Supported keyword arguments to `Arrow.write` include:
* `compress::Symbol`: possible values include `:lz4` or `:zstd`; will cause all buffers in each record batch to use the respective compression encoding
* `dictencode::Bool=false`: whether all columns should use dictionary encoding when being written
* `dictencodenested::Bool=false`: whether nested data type columns should also dict encode nested arrays/buffers; many other implementations don't support this
* `denseunions::Bool=true`: whether Julia `Vector{<:Union}` arrays should be written using the dense union layout; passing `false` will result in the sparse union layout
* `largelists::Bool=false`: causes list column types to be written with Int64 offset arrays; mainly for testing purposes; by default, Int64 offsets will be used only if needed
* `file::Bool=false`: if an `io` argument is being written to, passing `file=true` will cause the arrow file format to be written instead of just IPC streaming

##### Examples

```julia
# write directly to any IO in streaming format
Arrow.write(io, tbl)

# write to a file in file format
Arrow.write("data.arrow", tbl)
```
78 changes: 78 additions & 0 deletions julia/src/Arrow.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
    Arrow

A pure-Julia implementation of the [apache arrow](https://arrow.apache.org/) memory format specification.

Version 1.0 of the specification is supported, including:
* All primitive data types
* All nested data types
* Dictionary encodings and messages
* Extension types
* Streaming, file, record batch, and replacement and isdelta dictionary messages

Not currently supported:
* Tensors or sparse tensors
* Flight RPC
* C data interface

Third-party data formats:
* csv and parquet support via the existing CSV.jl and Parquet.jl packages
* Other Tables.jl-compatible packages automatically supported (DataFrames.jl, JSONTables.jl, JuliaDB.jl, SQLite.jl, MySQL.jl, JDBC.jl, ODBC.jl, XLSX.jl, etc.)
* No current Julia packages support ORC or Avro data formats

See docs for official Arrow.jl API with `Arrow.Table`, `Arrow.write`, and `Arrow.Stream`.
"""
module Arrow

using Mmap
import Dates
using DataAPI, Tables, SentinelArrays, PooledArrays, CodecLz4, CodecZstd

using Base: @propagate_inbounds
import Base: ==

# Module-wide debug verbosity; 0 disables all internal `@debug` output.
const DEBUG_LEVEL = Ref(0)

"""
    setdebug!(level::Int)

Set the global debug verbosity used by the internal `@debug` macro.
"""
function setdebug!(level::Int)
    DEBUG_LEVEL[] = level
    return
end

"""
    withdebug(f, level)

Run `f()` with the debug verbosity temporarily set to `level`, restoring
the previous level afterwards (even if `f` throws).
"""
function withdebug(f, level)
    saved = DEBUG_LEVEL[]
    try
        setdebug!(level)
        f()
    finally
        setdebug!(saved)
    end
end

# Internal debug printing: prints `msg`, prefixed with the file:line of the
# call site, when the global DEBUG_LEVEL is at least `level`.
macro debug(level, msg)
    src = __source__
    esc(quote
        if DEBUG_LEVEL[] >= $level
            println(string("DEBUG: ", $(QuoteNode(src.file)), ":", $(QuoteNode(src.line)), " ", $msg))
        end
    end)
end

# Arrow buffer alignment (bytes), the file-format magic bytes, and the IPC
# continuation indicator written before each message length.
const ALIGNMENT = 8
const FILE_FORMAT_MAGIC_BYTES = b"ARROW1"
const CONTINUATION_INDICATOR_BYTES = 0xffffffff

# vendored flatbuffers code for now
include("FlatBuffers/FlatBuffers.jl")
using .FlatBuffers

include("metadata/Flatbuf.jl")
using .Flatbuf; const Meta = Flatbuf

include("arrowtypes.jl")
using .ArrowTypes
include("utils.jl")
include("arraytypes.jl")
include("eltypes.jl")
include("table.jl")
include("write.jl")

end # module Arrow
Loading