Skip to content

Commit 24eb10d

Browse files
authored
Merge pull request #21 from yongtang/parquet
Support Apache Parquet format
2 parents c7ac5a0 + 1532441 commit 24eb10d

File tree

18 files changed

+9449
-0
lines changed

18 files changed

+9449
-0
lines changed

BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,6 @@ sh_binary(
1010
"//tensorflow_io/ignite:ignite_py",
1111
"//tensorflow_io/kafka:kafka_py",
1212
"//tensorflow_io/kinesis:kinesis_py",
13+
"//tensorflow_io/parquet:parquet_py",
1314
],
1415
)

WORKSPACE

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,74 @@ http_archive(
6161
strip_prefix = "aws-sdk-cpp-1.3.15",
6262
build_file = "//third_party:aws.BUILD",
6363
)
# --- Dependencies added for Apache Parquet support ---
# Parquet (C++) requires snappy, arrow, boost, and thrift; each is fetched
# as an http_archive with a Bazel-mirror URL first and the upstream URL as
# fallback, pinned by sha256.

http_archive(
    name = "snappy",
    urls = [
        "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.7.tar.gz",
        "https://github.com/google/snappy/archive/1.1.7.tar.gz",
    ],
    sha256 = "3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4",
    strip_prefix = "snappy-1.1.7",
    build_file = "//third_party:snappy.BUILD",
)

http_archive(
    name = "arrow",
    urls = [
        "https://mirror.bazel.build/github.com/apache/arrow/archive/apache-arrow-0.9.0.tar.gz",
        "https://github.com/apache/arrow/archive/apache-arrow-0.9.0.tar.gz",
    ],
    sha256 = "65f89a3910b6df02ac71e4d4283db9b02c5b3f1e627346c7b6a5982ae994af91",
    strip_prefix = "arrow-apache-arrow-0.9.0",
    build_file = "//third_party:arrow.BUILD",
)

http_archive(
    name = "boost",
    urls = [
        "https://mirror.bazel.build/dl.bintray.com/boostorg/release/1.67.0/source/boost_1_67_0.tar.gz",
        "https://dl.bintray.com/boostorg/release/1.67.0/source/boost_1_67_0.tar.gz",
    ],
    sha256 = "8aa4e330c870ef50a896634c931adf468b21f8a69b77007e45c444151229f665",
    strip_prefix = "boost_1_67_0",
    build_file = "//third_party:boost.BUILD",
)

http_archive(
    name = "thrift",
    urls = [
        "https://mirror.bazel.build/github.com/apache/thrift/archive/0.11.0.tar.gz",
        "https://github.com/apache/thrift/archive/0.11.0.tar.gz",
    ],
    sha256 = "0e324569321a1b626381baabbb98000c8dd3a59697292dbcc71e67135af0fefd",
    strip_prefix = "thrift-0.11.0",
    build_file = "//third_party:thrift.BUILD",
)

# Parquet needs generated parquet_types.h and parquet_types.cpp which are generated
# from src/parquet/parquet.thrift in apache-parquet-cpp-1.4.0.tar.gz.
#
# Generating parquet_types.h and parquet_types.cpp, however, needs both bison and flex
# installed, which is really an unnecessary step.
#
# We use the following step to generate the parquet_types.h and parquet_types.cpp files:
# - In third_party directory, run `docker run -i -t --rm -v $PWD:/v -w /v ubuntu:16.04 bash -x /v/parquet.type`
# - Once complete, a parquet.patch file will be generated which could be used as a patch in bazel
#
# $ cd third_party
# $ docker run -i -t --rm -v $PWD:/v -w /v ubuntu:16.04 bash -x /v/parquet.type
http_archive(
    name = "parquet",
    urls = [
        "https://mirror.bazel.build/github.com/apache/parquet-cpp/archive/apache-parquet-cpp-1.4.0.tar.gz",
        "https://github.com/apache/parquet-cpp/archive/apache-parquet-cpp-1.4.0.tar.gz",
    ],
    sha256 = "52899be6c9dc49a14976d4ad84597243696c3fa2882e5c802b56e912bfbcc7ce",
    strip_prefix = "parquet-cpp-apache-parquet-cpp-1.4.0",
    build_file = "//third_party:parquet.BUILD",
    # Pre-generated parquet_types.{h,cpp} are applied as a patch so the build
    # does not require bison/flex on the host (see comment above).
    patches = [
        "//third_party:parquet.patch",
    ],
    patch_args = ["-p1"],
)

tensorflow_io/parquet/BUILD

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
# BUILD file for the Parquet dataset op: a native kernel shared library,
# its Python wrapper libraries, and the kernel test.

licenses(["notice"])  # Apache 2.0

package(default_visibility = ["//visibility:public"])

filegroup(
    name = "test_data",
    srcs = glob(["python/kernel_tests/testdata/*"]),
)

cc_binary(
    # Named so the .so lands next to the Python wrapper that loads it.
    name = "python/ops/_parquet_ops.so",
    srcs = [
        "kernels/parquet_dataset_ops.cc",
        "ops/dataset_ops.cc",
    ],
    # _GLIBCXX_USE_CXX11_ABI=0 matches the ABI of the prebuilt
    # libtensorflow_framework this links against.
    copts = [
        "-pthread",
        "-std=c++11",
        "-D_GLIBCXX_USE_CXX11_ABI=0",
        "-DNDEBUG",
    ],
    linkshared = 1,
    deps = [
        "@local_config_tf//:libtensorflow_framework",
        "@local_config_tf//:tf_header_lib",
        "@parquet//:parquet",
    ],
)

py_library(
    name = "parquet_ops_py",
    srcs = [
        "python/ops/parquet_dataset_ops.py",
    ],
    data = [
        ":python/ops/_parquet_ops.so",
    ],
    srcs_version = "PY2AND3",
)

py_test(
    name = "parquet_py_test",
    srcs = [
        "python/kernel_tests/parquet_test.py",
    ],
    main = "python/kernel_tests/parquet_test.py",
    data = [
        ":test_data",
    ],
    deps = [
        ":parquet_ops_py",
    ],
    srcs_version = "PY2AND3",
)

py_library(
    name = "parquet_py",
    srcs = [
        "__init__.py",
        "python/__init__.py",
        "python/ops/__init__.py",
    ],
    deps = [
        ":parquet_ops_py",
    ],
    srcs_version = "PY2AND3",
)

tensorflow_io/parquet/__init__.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Parquet Dataset.

@@ParquetDataset
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow_io.parquet.python.ops.parquet_dataset_ops import ParquetDataset

from tensorflow.python.util.all_util import remove_undocumented

# Public API of this package; everything else is pruned below.
_allowed_symbols = [
    "ParquetDataset",
]

remove_undocumented(__name__)

0 commit comments

Comments (0)