Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions julia/.codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
comment: false
6 changes: 6 additions & 0 deletions julia/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Manifest.toml
*.jl.cov
*.jl.*.cov
*.jl.mem

test/_scrap.jl
41 changes: 41 additions & 0 deletions julia/.travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Documentation: http://docs.travis-ci.com/user/languages/julia/
language: julia
os:
- linux
- osx
- windows
arch:
- x64
# - x86
julia:
- 1.3
- 1
- nightly
branches:
only:
- master
- gh-pages # For building documentation
- /^testing-.*$/ # testing branches
- /^v[0-9]+\.[0-9]+\.[0-9]+$/ # version tags
matrix:
allow_failures:
- julia: nightly
fast_finish: true
exclude:
- os: osx
arch: x86
- os: linux
arch: x86
julia: 1.3
# include:
# - stage: "Documentation"
# julia: 1
# os: linux
# script:
# - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate(); Pkg.build("Arrow")'
# - julia --project=docs/ docs/make.jl
# after_success: skip
notifications:
email: false
after_success:
- julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
21 changes: 21 additions & 0 deletions julia/LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 JuliaData contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
33 changes: 33 additions & 0 deletions julia/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
name = "Arrow"
uuid = "69666777-d1a9-59fb-9406-91d4454c9d45"
authors = ["quinnj <[email protected]>", "ExpandingMan <[email protected]>"]
version = "0.3.0"

[deps]
CodecLz4 = "5ba52731-8f18-5e0d-9241-30f10d1ec561"
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
CodecLz4 = "0.4"
CodecZstd = "0.7"
julia = "1.3"
DataAPI = "1"
PooledArrays = "0.5"
Tables = "1.1"
SentinelArrays = "1"

[extras]
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"


[targets]
test = ["Test", "Random", "JSON3", "StructTypes"]
135 changes: 135 additions & 0 deletions julia/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Arrow

[![Build Status](https://travis-ci.com/JuliaData/Arrow.jl.svg?branch=master)](https://travis-ci.com/JuliaData/Arrow.jl)
[![codecov](https://codecov.io/gh/JuliaData/Arrow.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaData/Arrow.jl)

This is a pure Julia implementation of the [Apache Arrow](https://arrow.apache.org) data standard. This package provides Julia `AbstractVector` objects for
referencing data that conforms to the Arrow standard. This allows users to seamlessly interface Arrow formatted data with a great deal of existing Julia code.

Please see this [document](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout) for a description of the Arrow memory layout.

## Format Support

This implementation supports the 1.0 version of the specification, including support for:
* All primitive data types
* All nested data types
* Dictionary encodings and messages
* Extension types
* Streaming, file, record batch, and replacement and isdelta dictionary messages

It currently doesn't include support for:
* Tensors or sparse tensors
* Flight RPC
* C data interface

Third-party data formats:
* csv and parquet support via the existing CSV.jl and Parquet.jl packages
* Other Tables.jl-compatible packages automatically supported (DataFrames.jl, JSONTables.jl, JuliaDB.jl, SQLite.jl, MySQL.jl, JDBC.jl, ODBC.jl, XLSX.jl, etc.)
* No current Julia packages support ORC or Avro data formats

## Basic usage:

### Installation

```julia
] add Arrow
```

### Reading

#### `Arrow.Table`

Arrow.Table(io::IO; convert::Bool=true)
Arrow.Table(file::String; convert::Bool=true)
Arrow.Table(bytes::Vector{UInt8}, pos=1, len=nothing; convert::Bool=true)

Read an arrow formatted table, from:
* `io`, bytes will be read all at once via `read(io)`
* `file`, bytes will be read via `Mmap.mmap(file)`
* `bytes`, a byte vector directly, optionally specifying the starting byte position `pos` and length `len`

Returns an `Arrow.Table` object that allows column access via `table.col1`, `table[:col1]`, or `table[1]`.

`Arrow.Table` also satisfies the Tables.jl interface, and so can easily be materialized via any supporting
sink function: e.g. `DataFrame(Arrow.Table(file))`, `SQLite.load!(db, "table", Arrow.Table(file))`, etc.

Supports the `convert` keyword argument which controls whether certain arrow primitive types will be
lazily converted to more friendly Julia defaults; by default, `convert=true`.

##### Examples

```julia
using Arrow

# read arrow table from file format
tbl = Arrow.Table(file)

# read arrow table from IO
tbl = Arrow.Table(io)

# read arrow table directly from bytes, like from an HTTP request
resp = HTTP.get(url)
tbl = Arrow.Table(resp.body)
```

#### `Arrow.Stream`

Arrow.Stream(io::IO; convert::Bool=true)
Arrow.Stream(file::String; convert::Bool=true)
Arrow.Stream(bytes::Vector{UInt8}, pos=1, len=nothing; convert::Bool=true)

Start reading an arrow formatted table, from:
* `io`, bytes will be read all at once via `read(io)`
* `file`, bytes will be read via `Mmap.mmap(file)`
* `bytes`, a byte vector directly, optionally specifying the starting byte position `pos` and length `len`

Reads the initial schema message from the arrow stream/file, then returns an `Arrow.Stream` object
which will iterate over record batch messages, producing an `Arrow.Table` on each iteration.

By iterating `Arrow.Table`, `Arrow.Stream` satisfies the `Tables.partitions` interface, and as such can
be passed to Tables.jl-compatible sink functions.

This allows iterating over extremely large "arrow tables" in chunks represented as record batches.

Supports the `convert` keyword argument which controls whether certain arrow primitive types will be
lazily converted to more friendly Julia defaults; by default, `convert=true`.

### Writing

#### `Arrow.write`

Arrow.write(io::IO, tbl)
Arrow.write(file::String, tbl)

Write any Tables.jl-compatible `tbl` out as arrow formatted data.
Providing an `io::IO` argument will cause the data to be written to it
in the "streaming" format, unless `file=true` keyword argument is passed.
Providing a `file::String` argument will result in the "file" format being written.

Multiple record batches will be written based on the number of
`Tables.partitions(tbl)` that are provided; by default, this is just
one for a given table, but some table sources support automatic
partitioning. Note you can turn multiple table objects into partitions
by doing `Tables.partitioner([tbl1, tbl2, ...])`, but note that
each table must have the exact same `Tables.Schema`.

By default, `Arrow.write` will use multiple threads to write multiple
record batches simultaneously (e.g. if julia is started with `julia -t 8`).

Supported keyword arguments to `Arrow.write` include:
* `compress::Symbol`: possible values include `:lz4` or `:zstd`; will cause all buffers in each record batch to use the respective compression encoding
* `dictencode::Bool=false`: whether all columns should use dictionary encoding when being written
* `dictencodenested::Bool=false`: whether nested data type columns should also dict encode nested arrays/buffers; many other implementations don't support this
* `denseunions::Bool=true`: whether Julia `Vector{<:Union}` arrays should be written using the dense union layout; passing `false` will result in the sparse union layout
* `largelists::Bool=false`: causes list column types to be written with Int64 offset arrays; mainly for testing purposes; by default, Int64 offsets will be used only if needed
* `file::Bool=false`: if an `io` argument is being written to, passing `file=true` will cause the arrow file format to be written instead of just IPC streaming

##### Examples

```julia
# write directly to any IO in streaming format
Arrow.write(io, tbl)

# write to a file in file format
Arrow.write("data.arrow", tbl)
```
78 changes: 78 additions & 0 deletions julia/src/Arrow.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
    Arrow

A pure-Julia implementation of the [apache arrow](https://arrow.apache.org/) memory format specification.

Version 1.0 of the specification is supported, including:
* All primitive data types
* All nested data types
* Dictionary encodings and messages
* Extension types
* Streaming, file, record batch, and replacement and isdelta dictionary messages

Not currently supported:
* Tensors or sparse tensors
* Flight RPC
* C data interface

Third-party data formats:
* csv and parquet support via the existing CSV.jl and Parquet.jl packages
* Other Tables.jl-compatible packages automatically supported (DataFrames.jl, JSONTables.jl, JuliaDB.jl, SQLite.jl, MySQL.jl, JDBC.jl, ODBC.jl, XLSX.jl, etc.)
* No current Julia packages support ORC or Avro data formats

See docs for official Arrow.jl API with `Arrow.Table`, `Arrow.write`, and `Arrow.Stream`.
"""
module Arrow

using Mmap
import Dates
using DataAPI, Tables, SentinelArrays, PooledArrays, CodecLz4, CodecZstd

using Base: @propagate_inbounds
import Base: ==

# Module-wide debug verbosity; 0 disables all internal `@debug` output.
const DEBUG_LEVEL = Ref(0)

"""
    setdebug!(level::Int)

Set the global debug verbosity used by the internal `@debug` macro.
"""
function setdebug!(level::Int)
    DEBUG_LEVEL[] = level
    return
end

"""
    withdebug(f, level)

Run `f()` with the debug verbosity temporarily set to `level`, restoring
the previous level afterwards (even if `f` throws).
"""
function withdebug(f, level)
    saved = DEBUG_LEVEL[]
    try
        setdebug!(level)
        f()
    finally
        setdebug!(saved)
    end
end

# Internal debug printing: prints `msg`, prefixed with the file:line of the
# call site, when the global DEBUG_LEVEL is at least `level`.
macro debug(level, msg)
    src = __source__
    esc(quote
        if DEBUG_LEVEL[] >= $level
            println(string("DEBUG: ", $(QuoteNode(src.file)), ":", $(QuoteNode(src.line)), " ", $msg))
        end
    end)
end

# Arrow buffer alignment (bytes), the file-format magic bytes, and the IPC
# continuation indicator written before each message length.
const ALIGNMENT = 8
const FILE_FORMAT_MAGIC_BYTES = b"ARROW1"
const CONTINUATION_INDICATOR_BYTES = 0xffffffff

# vendored flatbuffers code for now
include("FlatBuffers/FlatBuffers.jl")
using .FlatBuffers

include("metadata/Flatbuf.jl")
using .Flatbuf; const Meta = Flatbuf

include("arrowtypes.jl")
using .ArrowTypes
include("utils.jl")
include("arraytypes.jl")
include("eltypes.jl")
include("table.jl")
include("write.jl")

end # module Arrow
Loading