From 5d3d869d163d3c5ff009c28a291a7f6b8c10ffc3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 14 Apr 2025 07:28:57 -0400 Subject: [PATCH 1/7] Add example binary variant data --- .gitignore | 2 + README.md | 7 + variant/.gitignore | 3 + variant/README.md | 55 ++++++++ variant/array_empty.json | 1 + variant/array_empty.metadata | Bin 0 -> 3 bytes variant/array_empty.value | Bin 0 -> 3 bytes variant/array_nested.json | 1 + variant/array_nested.metadata | Bin 0 -> 23 bytes variant/array_nested.value | Bin 0 -> 73 bytes variant/array_primitive.json | 1 + variant/array_primitive.metadata | Bin 0 -> 3 bytes variant/array_primitive.value | Bin 0 -> 15 bytes variant/long_string.json | 1 + variant/long_string.metadata | Bin 0 -> 3 bytes variant/long_string.value | Bin 0 -> 157 bytes variant/object_empty.json | 1 + variant/object_empty.metadata | Bin 0 -> 3 bytes variant/object_empty.value | Bin 0 -> 3 bytes variant/object_nested.json | 1 + variant/object_nested.metadata | Bin 0 -> 83 bytes variant/object_nested.value | Bin 0 -> 79 bytes variant/object_primitive.json | 1 + variant/object_primitive.metadata | Bin 0 -> 105 bytes variant/object_primitive.value | Bin 0 -> 66 bytes variant/primitive_binary.json | 1 + variant/primitive_binary.metadata | Bin 0 -> 3 bytes variant/primitive_binary.value | Bin 0 -> 14 bytes variant/primitive_boolean_false.json | 1 + variant/primitive_boolean_false.metadata | Bin 0 -> 3 bytes variant/primitive_boolean_false.value | 1 + variant/primitive_boolean_true.json | 1 + variant/primitive_boolean_true.metadata | Bin 0 -> 3 bytes variant/primitive_boolean_true.value | 1 + variant/primitive_date.json | 1 + variant/primitive_date.metadata | Bin 0 -> 3 bytes variant/primitive_date.value | Bin 0 -> 5 bytes variant/primitive_decimal16.json | 1 + variant/primitive_decimal16.metadata | Bin 0 -> 3 bytes variant/primitive_decimal16.value | Bin 0 -> 18 bytes variant/primitive_decimal4.json | 1 + variant/primitive_decimal4.metadata | Bin 0 -> 3 bytes 
variant/primitive_decimal4.value | Bin 0 -> 6 bytes variant/primitive_decimal8.json | 1 + variant/primitive_decimal8.metadata | Bin 0 -> 3 bytes variant/primitive_decimal8.value | Bin 0 -> 10 bytes variant/primitive_double.json | 1 + variant/primitive_double.metadata | Bin 0 -> 3 bytes variant/primitive_double.value | 1 + variant/primitive_float.json | 1 + variant/primitive_float.metadata | Bin 0 -> 3 bytes variant/primitive_float.value | 1 + variant/primitive_int16.json | 1 + variant/primitive_int16.metadata | Bin 0 -> 3 bytes variant/primitive_int16.value | 1 + variant/primitive_int32.json | 1 + variant/primitive_int32.metadata | Bin 0 -> 3 bytes variant/primitive_int32.value | Bin 0 -> 5 bytes variant/primitive_int64.json | 1 + variant/primitive_int64.metadata | Bin 0 -> 3 bytes variant/primitive_int64.value | Bin 0 -> 5 bytes variant/primitive_int8.json | 1 + variant/primitive_int8.metadata | Bin 0 -> 3 bytes variant/primitive_int8.value | 1 + variant/primitive_null.json | 0 variant/primitive_null.metadata | 0 variant/primitive_null.value | 0 variant/primitive_string.json | 1 + variant/primitive_string.metadata | Bin 0 -> 3 bytes variant/primitive_string.value | Bin 0 -> 179 bytes variant/primitive_timestamp.json | 1 + variant/primitive_timestamp.metadata | Bin 0 -> 3 bytes variant/primitive_timestamp.value | Bin 0 -> 9 bytes variant/primitive_timestampntz.json | 1 + variant/primitive_timestampntz.metadata | Bin 0 -> 3 bytes variant/primitive_timestampntz.value | Bin 0 -> 9 bytes variant/regen.py | 156 +++++++++++++++++++++++ variant/short_string.json | 1 + variant/short_string.metadata | Bin 0 -> 3 bytes variant/short_string.value | 1 + 80 files changed, 254 insertions(+) create mode 100644 .gitignore create mode 100644 variant/.gitignore create mode 100644 variant/README.md create mode 100644 variant/array_empty.json create mode 100644 variant/array_empty.metadata create mode 100644 variant/array_empty.value create mode 100644 variant/array_nested.json create 
mode 100644 variant/array_nested.metadata create mode 100644 variant/array_nested.value create mode 100644 variant/array_primitive.json create mode 100644 variant/array_primitive.metadata create mode 100644 variant/array_primitive.value create mode 100644 variant/long_string.json create mode 100644 variant/long_string.metadata create mode 100644 variant/long_string.value create mode 100644 variant/object_empty.json create mode 100644 variant/object_empty.metadata create mode 100644 variant/object_empty.value create mode 100644 variant/object_nested.json create mode 100644 variant/object_nested.metadata create mode 100644 variant/object_nested.value create mode 100644 variant/object_primitive.json create mode 100644 variant/object_primitive.metadata create mode 100644 variant/object_primitive.value create mode 100644 variant/primitive_binary.json create mode 100644 variant/primitive_binary.metadata create mode 100644 variant/primitive_binary.value create mode 100644 variant/primitive_boolean_false.json create mode 100644 variant/primitive_boolean_false.metadata create mode 100644 variant/primitive_boolean_false.value create mode 100644 variant/primitive_boolean_true.json create mode 100644 variant/primitive_boolean_true.metadata create mode 100644 variant/primitive_boolean_true.value create mode 100644 variant/primitive_date.json create mode 100644 variant/primitive_date.metadata create mode 100644 variant/primitive_date.value create mode 100644 variant/primitive_decimal16.json create mode 100644 variant/primitive_decimal16.metadata create mode 100644 variant/primitive_decimal16.value create mode 100644 variant/primitive_decimal4.json create mode 100644 variant/primitive_decimal4.metadata create mode 100644 variant/primitive_decimal4.value create mode 100644 variant/primitive_decimal8.json create mode 100644 variant/primitive_decimal8.metadata create mode 100644 variant/primitive_decimal8.value create mode 100644 variant/primitive_double.json create mode 100644 
variant/primitive_double.metadata create mode 100644 variant/primitive_double.value create mode 100644 variant/primitive_float.json create mode 100644 variant/primitive_float.metadata create mode 100644 variant/primitive_float.value create mode 100644 variant/primitive_int16.json create mode 100644 variant/primitive_int16.metadata create mode 100644 variant/primitive_int16.value create mode 100644 variant/primitive_int32.json create mode 100644 variant/primitive_int32.metadata create mode 100644 variant/primitive_int32.value create mode 100644 variant/primitive_int64.json create mode 100644 variant/primitive_int64.metadata create mode 100644 variant/primitive_int64.value create mode 100644 variant/primitive_int8.json create mode 100644 variant/primitive_int8.metadata create mode 100644 variant/primitive_int8.value create mode 100644 variant/primitive_null.json create mode 100644 variant/primitive_null.metadata create mode 100644 variant/primitive_null.value create mode 100644 variant/primitive_string.json create mode 100644 variant/primitive_string.metadata create mode 100644 variant/primitive_string.value create mode 100644 variant/primitive_timestamp.json create mode 100644 variant/primitive_timestamp.metadata create mode 100644 variant/primitive_timestamp.value create mode 100644 variant/primitive_timestampntz.json create mode 100644 variant/primitive_timestampntz.metadata create mode 100644 variant/primitive_timestampntz.value create mode 100644 variant/regen.py create mode 100644 variant/short_string.json create mode 100644 variant/short_string.metadata create mode 100644 variant/short_string.value diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..56e80f4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea +variant/derby.log \ No newline at end of file diff --git a/README.md b/README.md index a11eb6a..fe74318 100644 --- a/README.md +++ b/README.md @@ -17,3 +17,10 @@ ~ under the License. 
--> # Testing Data and Utilities for Apache Parquet + +- [data](data/README.md) - Sample Parquet data files for testing +- [bad_data](bad_data/README.md) - Reproducers for bad data files for testing +- [variant](variant/README.md) - Sample [Variant] binary values + + +[Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md \ No newline at end of file diff --git a/variant/.gitignore b/variant/.gitignore new file mode 100644 index 0000000..1e03a04 --- /dev/null +++ b/variant/.gitignore @@ -0,0 +1,3 @@ +derby.log +spark-warehouse +metastore_db \ No newline at end of file diff --git a/variant/README.md b/variant/README.md new file mode 100644 index 0000000..15f8482 --- /dev/null +++ b/variant/README.md @@ -0,0 +1,55 @@ + + +# Variant Binary Encoding + +This directory contains binary artifacts encoded using the Parquet [Variant] +binary encoding. These files are **not** valid Parquet files, but rather +raw binary data. + +## Structure + +Each example consists of three files: + +* `<name>.metadata` -- the binary contents of the `metadata` field +* `<name>.value` -- the binary contents of the `value` field +* `<name>.json` -- the equivalent JSON + +## Descriptions + +1. `primitive_<type>` -- Examples of primitive values (`basic_type` = 0), one for each of the [primitive types listed in the spec] +2. `short_string` -- Example of a short string (`basic_type` = 1) +3. `object_empty` -- Example of an object (`basic_type` = 2) with no fields +4. `object_primitive` -- Example of an object with only primitive fields +5. `object_nested` -- Example of an object with other objects in its fields +6. `array_empty` -- Example of an array (`basic_type` = 3) with no elements +7. `array_primitive` -- Example of an array with only primitive elements +8. `array_nested` -- Example of an array with objects and other arrays in its elements + + +## Regenerating these files + +The files were generated by running the [`regen.py`](regen.py) script that uses Apache Spark to +generate the files.
+ + + + +[Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md +[primitive types listed in the spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-primitive-type-basic_type0 diff --git a/variant/array_empty.json b/variant/array_empty.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/variant/array_empty.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/variant/array_empty.metadata b/variant/array_empty.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/array_empty.value b/variant/array_empty.value new file mode 100644 index 0000000000000000000000000000000000000000..fd7603414f3bed441af945335286aa4b64914c0a GIT binary patch literal 3 KcmZQ(U;qFB3;+ZG literal 0 HcmV?d00001 diff --git a/variant/array_nested.json b/variant/array_nested.json new file mode 100644 index 0000000..761b622 --- /dev/null +++ b/variant/array_nested.json @@ -0,0 +1 @@ +[{"id":1,"thing":{"names":["Contrarian","Spider"]}},{"id":2,"names":["Apple","Ray",null],"type":"if"}] \ No newline at end of file diff --git a/variant/array_nested.metadata b/variant/array_nested.metadata new file mode 100644 index 0000000000000000000000000000000000000000..6b96deae6ad1e6110fb137d2d5e1b2bc1eeb80ca GIT binary patch literal 23 ecmZQ%VPInC5y(s_$;ix0&r8frEiS1nNCf~p(FW)M literal 0 HcmV?d00001 diff --git a/variant/array_nested.value b/variant/array_nested.value new file mode 100644 index 0000000000000000000000000000000000000000..5985b7c5c8737ae5f396173895c31a00ae220ebf GIT binary patch literal 73 zcmZQ(Vo-HqVq#!qV3OluWMX7u5NBp$;1<$!&d)0;N-WAu%##c*$V^EsVq#_hB333b a9wyGrG-hT7HZE>a$AW^KRNkP(N(KOM1`G!P literal 0 HcmV?d00001 diff --git a/variant/array_primitive.json b/variant/array_primitive.json new file mode 100644 index 0000000..6a57c5c --- /dev/null +++ 
b/variant/array_primitive.json @@ -0,0 +1 @@ +[2,1,5,9] \ No newline at end of file diff --git a/variant/array_primitive.metadata b/variant/array_primitive.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/array_primitive.value b/variant/array_primitive.value new file mode 100644 index 0000000000000000000000000000000000000000..882fda1f2fcf009aebf7097022436058c5166a1c GIT binary patch literal 15 WcmZQ(VPIlmXgg=5VWBo2s4I#y&)g5?66A+ybskU`NBg1FeX1sd;IDMFHf o%!L5wNy%l_QufXlifKIFnxdz3gU|K(eINd5f1fX>@$stbA9oBrP5=M^ literal 0 HcmV?d00001 diff --git a/variant/object_empty.json b/variant/object_empty.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/variant/object_empty.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/variant/object_empty.metadata b/variant/object_empty.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/object_empty.value b/variant/object_empty.value new file mode 100644 index 0000000000000000000000000000000000000000..aa7650d5edefc154986e199c8ba41b90830ed110 GIT binary patch literal 3 KcmZQ#U;qFB2>=5C literal 0 HcmV?d00001 diff --git a/variant/object_nested.json b/variant/object_nested.json new file mode 100644 index 0000000..0e9768b --- /dev/null +++ b/variant/object_nested.json @@ -0,0 +1 @@ +{"id":1,"observation":{"location":"In the Volcano","time":"12:34:56","value":{"humidity":456,"temperature":123}},"species":{"name":"lava monster","population":12345}} \ No newline at end of file diff --git a/variant/object_nested.metadata b/variant/object_nested.metadata new file mode 100644 index 0000000000000000000000000000000000000000..71548c3255354444ef4ac18ee96476c4915ba5a4 GIT binary patch literal 83 
zcmZSNVqoIr6<1QzGq!WfOero%P0ma$&P&WqEyyn@%}FfD%+JeDDo!mb12ao9b5nEj flfmq=#GKO9lGNOS)S|?a(xTLi(%j6H%#um~eLo)g literal 0 HcmV?d00001 diff --git a/variant/object_nested.value b/variant/object_nested.value new file mode 100644 index 0000000000000000000000000000000000000000..f6385a6fb6d0f614505c0ffe4b83a6568561b23d GIT binary patch literal 79 zcmZQ#W?*4tkYsY>VPs-rVrJkKFw99TOH|0s&nqrTEfTOaU}9!tW#?p&P*F5AvNATY iGBvaG%u^`INL2{S&q+?q%V%QZcrNlD2sP0C4yu#@uhb5axY;!BE3(L~Y`bBdut#U(|V ZdFc=hd8Iiy5LQWMZfbE!Vr~J13jo^?C}#iw literal 0 HcmV?d00001 diff --git a/variant/object_primitive.value b/variant/object_primitive.value new file mode 100644 index 0000000000000000000000000000000000000000..eacd3d9460cb5159847e2ab6c485fef397935b5b GIT binary patch literal 66 zcmZQ#XJ%q#U}a(BU}UOm VV4`bi7Gh{*Wo%+)YNlsy0RT+741)jw literal 0 HcmV?d00001 diff --git a/variant/primitive_binary.json b/variant/primitive_binary.json new file mode 100644 index 0000000..d5009d8 --- /dev/null +++ b/variant/primitive_binary.json @@ -0,0 +1 @@ +"AxM33q2+78r+" \ No newline at end of file diff --git a/variant/primitive_binary.metadata b/variant/primitive_binary.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_binary.value b/variant/primitive_binary.value new file mode 100644 index 0000000000000000000000000000000000000000..a874b037887dfd7f87b70b35b753db81d8d8f024 GIT binary patch literal 14 VcmcEFWME)m7B;`PcHjF`{{R;21(N^( literal 0 HcmV?d00001 diff --git a/variant/primitive_boolean_false.json b/variant/primitive_boolean_false.json new file mode 100644 index 0000000..02e4a84 --- /dev/null +++ b/variant/primitive_boolean_false.json @@ -0,0 +1 @@ +false \ No newline at end of file diff --git a/variant/primitive_boolean_false.metadata b/variant/primitive_boolean_false.metadata new file mode 100644 index 
0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_boolean_false.value b/variant/primitive_boolean_false.value new file mode 100644 index 0000000..5a77f05 --- /dev/null +++ b/variant/primitive_boolean_false.value @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/variant/primitive_boolean_true.json b/variant/primitive_boolean_true.json new file mode 100644 index 0000000..f32a580 --- /dev/null +++ b/variant/primitive_boolean_true.json @@ -0,0 +1 @@ +true \ No newline at end of file diff --git a/variant/primitive_boolean_true.metadata b/variant/primitive_boolean_true.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_boolean_true.value b/variant/primitive_boolean_true.value new file mode 100644 index 0000000..45a8ca0 --- /dev/null +++ b/variant/primitive_boolean_true.value @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/variant/primitive_date.json b/variant/primitive_date.json new file mode 100644 index 0000000..d1ed165 --- /dev/null +++ b/variant/primitive_date.json @@ -0,0 +1 @@ +"2025-04-16" \ No newline at end of file diff --git a/variant/primitive_date.metadata b/variant/primitive_date.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_date.value b/variant/primitive_date.value new file mode 100644 index 0000000000000000000000000000000000000000..bf2bb0747742c3862a1ea737774b6cfad01c8846 GIT binary patch literal 5 McmdOSt<8 literal 0 HcmV?d00001 diff --git a/variant/primitive_decimal16.json b/variant/primitive_decimal16.json new file mode 100644 index 0000000..121c171 --- /dev/null +++ 
b/variant/primitive_decimal16.json @@ -0,0 +1 @@ +12345678912345678.9 \ No newline at end of file diff --git a/variant/primitive_decimal16.metadata b/variant/primitive_decimal16.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_decimal16.value b/variant/primitive_decimal16.value new file mode 100644 index 0000000000000000000000000000000000000000..3b441af4fdeb306fc4d6c7358e2d2ef6b21a8230 GIT binary patch literal 18 UcmdO3y0lICz!w1}K?W!Q05pXI%>V!Z literal 0 HcmV?d00001 diff --git a/variant/primitive_decimal4.json b/variant/primitive_decimal4.json new file mode 100644 index 0000000..2e0c11f --- /dev/null +++ b/variant/primitive_decimal4.json @@ -0,0 +1 @@ +12.34 \ No newline at end of file diff --git a/variant/primitive_decimal4.metadata b/variant/primitive_decimal4.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_decimal4.value b/variant/primitive_decimal4.value new file mode 100644 index 0000000000000000000000000000000000000000..a8dc7f1d231ffe016a8b55ef2f10d97dc391d8a4 GIT binary patch literal 6 NcmY#jy2Qf3000Ce0Qmp_ literal 0 HcmV?d00001 diff --git a/variant/primitive_decimal8.json b/variant/primitive_decimal8.json new file mode 100644 index 0000000..07d7211 --- /dev/null +++ b/variant/primitive_decimal8.json @@ -0,0 +1 @@ +12345678.9 \ No newline at end of file diff --git a/variant/primitive_decimal8.metadata b/variant/primitive_decimal8.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_decimal8.value b/variant/primitive_decimal8.value new file mode 100644 index 
0000000000000000000000000000000000000000..41744b5dd4b3f38974ee1ffa0eccaaf3a4532cf2 GIT binary patch literal 10 PcmY#ky2LcilK}_-4HE&{ literal 0 HcmV?d00001 diff --git a/variant/primitive_double.json b/variant/primitive_double.json new file mode 100644 index 0000000..21bfabf --- /dev/null +++ b/variant/primitive_double.json @@ -0,0 +1 @@ +1.2345678901234E9 \ No newline at end of file diff --git a/variant/primitive_double.metadata b/variant/primitive_double.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_double.value b/variant/primitive_double.value new file mode 100644 index 0000000..d8eedde --- /dev/null +++ b/variant/primitive_double.value @@ -0,0 +1 @@ +É凴€eÒA \ No newline at end of file diff --git a/variant/primitive_float.json b/variant/primitive_float.json new file mode 100644 index 0000000..7beed6c --- /dev/null +++ b/variant/primitive_float.json @@ -0,0 +1 @@ +1.23456794E9 \ No newline at end of file diff --git a/variant/primitive_float.metadata b/variant/primitive_float.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_float.value b/variant/primitive_float.value new file mode 100644 index 0000000..aeb162a --- /dev/null +++ b/variant/primitive_float.value @@ -0,0 +1 @@ +8,“N \ No newline at end of file diff --git a/variant/primitive_int16.json b/variant/primitive_int16.json new file mode 100644 index 0000000..274c005 --- /dev/null +++ b/variant/primitive_int16.json @@ -0,0 +1 @@ +1234 \ No newline at end of file diff --git a/variant/primitive_int16.metadata b/variant/primitive_int16.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 
KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_int16.value b/variant/primitive_int16.value new file mode 100644 index 0000000..ac699f7 --- /dev/null +++ b/variant/primitive_int16.value @@ -0,0 +1 @@ +Ò \ No newline at end of file diff --git a/variant/primitive_int32.json b/variant/primitive_int32.json new file mode 100644 index 0000000..4632e06 --- /dev/null +++ b/variant/primitive_int32.json @@ -0,0 +1 @@ +123456 \ No newline at end of file diff --git a/variant/primitive_int32.metadata b/variant/primitive_int32.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_int32.value b/variant/primitive_int32.value new file mode 100644 index 0000000000000000000000000000000000000000..f6ec47504f2bc50818e0fac46901ea51de95469b GIT binary patch literal 5 McmWe}c*Mv600a>MH~;_u literal 0 HcmV?d00001 diff --git a/variant/primitive_int64.json b/variant/primitive_int64.json new file mode 100644 index 0000000..e9a9ea1 --- /dev/null +++ b/variant/primitive_int64.json @@ -0,0 +1 @@ +12345678 \ No newline at end of file diff --git a/variant/primitive_int64.metadata b/variant/primitive_int64.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_int64.value b/variant/primitive_int64.value new file mode 100644 index 0000000000000000000000000000000000000000..098d0ec5eae2f8a91b995241f997dcfdd71c416a GIT binary patch literal 5 McmWgfOWeZ%00cY%fB*mh literal 0 HcmV?d00001 diff --git a/variant/primitive_int8.json b/variant/primitive_int8.json new file mode 100644 index 0000000..f70d7bb --- /dev/null +++ b/variant/primitive_int8.json @@ -0,0 +1 @@ +42 \ No newline at end of file diff --git a/variant/primitive_int8.metadata 
b/variant/primitive_int8.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_int8.value b/variant/primitive_int8.value new file mode 100644 index 0000000..77f9ec5 --- /dev/null +++ b/variant/primitive_int8.value @@ -0,0 +1 @@ + * \ No newline at end of file diff --git a/variant/primitive_null.json b/variant/primitive_null.json new file mode 100644 index 0000000..e69de29 diff --git a/variant/primitive_null.metadata b/variant/primitive_null.metadata new file mode 100644 index 0000000..e69de29 diff --git a/variant/primitive_null.value b/variant/primitive_null.value new file mode 100644 index 0000000..e69de29 diff --git a/variant/primitive_string.json b/variant/primitive_string.json new file mode 100644 index 0000000..00bc922 --- /dev/null +++ b/variant/primitive_string.json @@ -0,0 +1 @@ +"This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as ðŸ¢, 💖, ♥ï¸, 🎣 and 🤦!!" 
\ No newline at end of file diff --git a/variant/primitive_string.metadata b/variant/primitive_string.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_string.value b/variant/primitive_string.value new file mode 100644 index 0000000000000000000000000000000000000000..40da840e50466d139a015e987383a041b9b74c96 GIT binary patch literal 179 zcmXYqJr2S!3`TntckhUpIRra0iQ7bq63KRvxC9^uB&JIIjm(^f3(zCrA{NVj@3Xyl zR#mkOQ9zVq?!dZKaz_jrFvENX>#dRiW^aU;+QJ0B7|KPVjS4XXf`r1l`X8CXCNN1c z>r(G6F0>)WWNVfe7bCcUj82J7_AVHOaX4HWtH)CV@ALih+E3H&`ujf)w?|!n0jWPl AbN~PV literal 0 HcmV?d00001 diff --git a/variant/primitive_timestamp.json b/variant/primitive_timestamp.json new file mode 100644 index 0000000..fce2701 --- /dev/null +++ b/variant/primitive_timestamp.json @@ -0,0 +1 @@ +"2025-04-16 12:34:56.78-04:00" \ No newline at end of file diff --git a/variant/primitive_timestamp.metadata b/variant/primitive_timestamp.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_timestamp.value b/variant/primitive_timestamp.value new file mode 100644 index 0000000000000000000000000000000000000000..a489ecb4b0c62c0f0e49958562f5a141f839d293 GIT binary patch literal 9 QcmXqb5H$Vnb0ane02dGg_5c6? 
literal 0 HcmV?d00001 diff --git a/variant/primitive_timestampntz.json b/variant/primitive_timestampntz.json new file mode 100644 index 0000000..ff69f87 --- /dev/null +++ b/variant/primitive_timestampntz.json @@ -0,0 +1 @@ +"2025-04-16 12:34:56.78" \ No newline at end of file diff --git a/variant/primitive_timestampntz.metadata b/variant/primitive_timestampntz.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/primitive_timestampntz.value b/variant/primitive_timestampntz.value new file mode 100644 index 0000000000000000000000000000000000000000..47125dd7e1c3eae02d02ff5a977809390ff30989 GIT binary patch literal 9 QcmXqcaLA+ii4hwE02Z7Bz5oCK literal 0 HcmV?d00001 diff --git a/variant/regen.py b/variant/regen.py new file mode 100644 index 0000000..adc80bf --- /dev/null +++ b/variant/regen.py @@ -0,0 +1,156 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# This program uses Apache Spark to generate example binary Variant data +# +# Requirements +# pip install pyarrow +# pip install pyspark +# +# Last run with Spark 4.0 preview 2: +# https://spark.apache.org/news/spark-4.0.0-preview2.html + +from pyspark.sql import SparkSession +import pyarrow.parquet as pq +import os + +# Initialize Spark session and create variant data via SQL +spark = SparkSession.builder \ + .appName("PySpark SQL Example") \ + .getOrCreate() + +# recursively cleanup the spark-warehouse directory +if os.path.exists('spark-warehouse'): + for root, dirs, files in os.walk('spark-warehouse', topdown=False): + for name in files: + os.remove(os.path.join(root, name)) + for name in dirs: + os.rmdir(os.path.join(root, name)) + + +# Create a table with variant and insert various types into it +# +# This writes data files into spark-warehouse/output +sql = """ +CREATE TABLE T (name VARCHAR(2000), variant_col VARIANT); + +------------------------------- +-- Primitive type (basic_type=0) +------------------------------- +-- One row with a value from each type listed in +-- https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types +-- +-- Spark Types: https://spark.apache.org/docs/latest/sql-ref-datatypes.html +-- Note: use explicit typecasts as spark returns an error for implicit casts +INSERT INTO T VALUES ('primitive_null', NULL); +INSERT INTO T VALUES ('primitive_boolean_true', true::Variant); +INSERT INTO T VALUES ('primitive_boolean_false', false::Variant); +INSERT INTO T VALUES ('primitive_int8', 42::Byte::Variant); +INSERT INTO T VALUES ('primitive_int16', 1234::Short::Variant); +INSERT INTO T VALUES ('primitive_int32', 123456::Integer::Variant); +INSERT INTO T VALUES ('primitive_int64', 12345678::Long::Variant); +INSERT INTO T VALUES ('primitive_double', 1234567890.1234::Double::Variant); +INSERT INTO T VALUES ('primitive_decimal4', 12.34::Decimal(8,2)::Variant); +INSERT INTO T VALUES ('primitive_decimal8', 
12345678.90::Decimal(12,2)::Variant); +INSERT INTO T VALUES ('primitive_decimal16', 12345678912345678.90::Decimal(30,2)::Variant); +INSERT INTO T VALUES ('primitive_date', '2025-04-16'::Date::Variant); +INSERT INTO T VALUES ('primitive_timestamp', '2025-04-16T12:34:56.78'::Timestamp::Variant); +INSERT INTO T VALUES ('primitive_timestampntz', '2025-04-16T12:34:56.78'::Timestamp_NTZ::Variant); +INSERT INTO T VALUES ('primitive_float', 1234567890.1234::Float::Variant); +INSERT INTO T VALUES ('primitive_binary', X'31337deadbeefcafe'::Variant); +INSERT INTO T VALUES ('primitive_string', 'This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!'::Variant); +-- It is not clear how to create these types using Spark SQL +-- TODO TimeNTZ (Type ID 17) +-- TODO 'timestamp with time zone' with nanosecond precision (Type ID 18) +-- TODO 'timestamp without time zone' with nanosecond precision (Type ID 19) +-- TODO 'UUID' (Type ID 20) + +------------------------------- +-- Short string (basic_type=1) +------------------------------- +INSERT INTO T VALUES ('short_string', 'Less than 64 bytes (❤️ with utf8)'::Variant); + +------------------------------- +-- Object (basic_type=2) +------------------------------- +-- Use parse_json to create Variant, as spark does not seem to support casting structs --> Variant.
+-- TODO create example variant objects with fields that have more specific types (like timestamp, date, etc) +-- cannot cast "STRUCT<...>" to "VARIANT"" +-- INSERT INTO T VALUES ('object_primitive', struct(1234.56::Double as double_field, true as boolean_true_field, false as boolean_false_field, '2025-04-16T12:34:56.78'::Timestamp as timestamp_field, 'Apache Parquet' as string_field, null as null_field)::Variant); +INSERT INTO T VALUES ('object_empty', parse_json('{}')::Variant); +INSERT INTO T VALUES ('object_primitive', parse_json('{"int_field" : 1, "double_field": 1.23456789, "boolean_true_field": true, "boolean_false_field": false, "string_field": "Apache Parquet", "null_field": null, "timestamp_field": "2025-04-16T12:34:56.78"}')::Variant); +INSERT INTO T VALUES ('object_nested', parse_json('{ "id" : 1, "species" : { "name": "lava monster", "population": 12345}, "observation" : { "time": "12:34:56", "location": "In the Volcano", "value" : { "temperature": 123, "humidity": 456 } } }')::Variant); + +--TODO objects with more than 2**8 distinct fields (that require using more than one byte for field offset) +--TODO objects with more than 2**16 distinct fields (that require using more than 2 bytes for field offset) +--TODO objects with more than 2**24 distinct fields (that require using more than 3 bytes for field offset) + +------------------------------- +-- Array (basic_type=3) +------------------------------- +INSERT INTO T VALUES ('array_empty', parse_json('[]')::Variant); +INSERT INTO T VALUES ('array_primitive', parse_json('[2, 1, 5, 9]')::Variant); +INSERT INTO T VALUES ('array_nested', parse_json('[ { "id": 1, "thing": { "names": ["Contrarian", "Spider"] } }, { "id": 2, "type": "if", "names": ["Apple", "Ray", null] } ]')::Variant); + +-- TODO arrays with more than 2**8 distinct elements (that require using more than one byte for count) +-- TODO arrays where the total length of all values is greater than 2**8, 2**16, and 2**24 bytes (requires using more 
than one byte for the offsets) + + +-- Copy the output to a new table that also has the JSON representation of the variant column +DROP TABLE IF EXISTS output; +CREATE TABLE output AS SELECT name, variant_col, to_json(variant_col) as json_col FROM T; +""" +for statement in sql.split("\n"): + statement = statement.strip() + if not statement or statement.startswith("--"): + continue + print("Running SQL:", statement) + spark.sql(statement) + +mypath = 'spark-warehouse/output' +parquet_files = [f for f in os.listdir(mypath) if f.endswith('.parquet')] + +print("Parquet files:", parquet_files) + +# extract the values from the parquet files +for f in parquet_files: + table = pq.read_table(os.path.join(mypath, f)) + for row in range(len(table)): + name = table[0][row] + # variants are stored as StructArrays with two fields: + # metadata, and value + variant_col = table[1][row] + metadata = variant_col['metadata'] + value = variant_col['value'] + json_value = table[2][row] + + print("Writing metadata for", name) + + # write the metadata, value, and json representation to files + with open(f"{name}.metadata", "wb") as f: + buffer = metadata.as_buffer() + if buffer is not None: + f.write(buffer) + with open(f"{name}.value", "wb") as f: + buffer = value.as_buffer() + if buffer is not None: + f.write(buffer) + with open(f"{name}.json", "wb") as f: + buffer = json_value.as_buffer() + if buffer is not None: + f.write(buffer) + diff --git a/variant/short_string.json b/variant/short_string.json new file mode 100644 index 0000000..47d8c95 --- /dev/null +++ b/variant/short_string.json @@ -0,0 +1 @@ +"Less than 64 bytes (â¤ï¸ with utf8)" \ No newline at end of file diff --git a/variant/short_string.metadata b/variant/short_string.metadata new file mode 100644 index 0000000000000000000000000000000000000000..12db4781e63a8c821478a5af5c840908f228181d GIT binary patch literal 3 KcmZQ%U;qFB1^@y8 literal 0 HcmV?d00001 diff --git a/variant/short_string.value b/variant/short_string.value new 
file mode 100644 index 0000000..c403fe2 --- /dev/null +++ b/variant/short_string.value @@ -0,0 +1 @@ +•Less than 64 bytes (â¤ï¸ with utf8) \ No newline at end of file From b1996369c3cb5a8ef43b821e1d4cf4a8039f4be5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 16 Apr 2025 10:54:10 -0400 Subject: [PATCH 2/7] Inline json representation --- variant/README.md | 5 +- variant/array_empty.json | 1 - variant/array_nested.json | 1 - variant/array_primitive.json | 1 - variant/data_dictionary.json | 72 ++++++++++++++++++++++++++++ variant/long_string.json | 1 - variant/object_empty.json | 1 - variant/object_nested.json | 1 - variant/object_primitive.json | 1 - variant/primitive_binary.json | 1 - variant/primitive_boolean_false.json | 1 - variant/primitive_boolean_true.json | 1 - variant/primitive_date.json | 1 - variant/primitive_decimal16.json | 1 - variant/primitive_decimal4.json | 1 - variant/primitive_decimal8.json | 1 - variant/primitive_double.json | 1 - variant/primitive_float.json | 1 - variant/primitive_int16.json | 1 - variant/primitive_int32.json | 1 - variant/primitive_int64.json | 1 - variant/primitive_int8.json | 1 - variant/primitive_null.json | 0 variant/primitive_string.json | 1 - variant/primitive_timestamp.json | 1 - variant/primitive_timestampntz.json | 1 - variant/regen.py | 20 +++++--- variant/short_string.json | 1 - 28 files changed, 89 insertions(+), 32 deletions(-) delete mode 100644 variant/array_empty.json delete mode 100644 variant/array_nested.json delete mode 100644 variant/array_primitive.json create mode 100644 variant/data_dictionary.json delete mode 100644 variant/long_string.json delete mode 100644 variant/object_empty.json delete mode 100644 variant/object_nested.json delete mode 100644 variant/object_primitive.json delete mode 100644 variant/primitive_binary.json delete mode 100644 variant/primitive_boolean_false.json delete mode 100644 variant/primitive_boolean_true.json delete mode 100644 variant/primitive_date.json delete mode 
100644 variant/primitive_decimal16.json delete mode 100644 variant/primitive_decimal4.json delete mode 100644 variant/primitive_decimal8.json delete mode 100644 variant/primitive_double.json delete mode 100644 variant/primitive_float.json delete mode 100644 variant/primitive_int16.json delete mode 100644 variant/primitive_int32.json delete mode 100644 variant/primitive_int64.json delete mode 100644 variant/primitive_int8.json delete mode 100644 variant/primitive_null.json delete mode 100644 variant/primitive_string.json delete mode 100644 variant/primitive_timestamp.json delete mode 100644 variant/primitive_timestampntz.json delete mode 100644 variant/short_string.json diff --git a/variant/README.md b/variant/README.md index 15f8482..ce8fce8 100644 --- a/variant/README.md +++ b/variant/README.md @@ -25,11 +25,12 @@ raw binary data. ## Structure -Each example consists of three files: +* `data_dictionary.json` - contains the JSON representation of each of the examples + +Each example consists of 2 files: * `.metadata` -- the binary contents of the `metadata` field * `.value` -- the binary contents of the `value` field -* `.json` -- the equivalent JSON ## Descriptions diff --git a/variant/array_empty.json b/variant/array_empty.json deleted file mode 100644 index 0637a08..0000000 --- a/variant/array_empty.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file diff --git a/variant/array_nested.json b/variant/array_nested.json deleted file mode 100644 index 761b622..0000000 --- a/variant/array_nested.json +++ /dev/null @@ -1 +0,0 @@ -[{"id":1,"thing":{"names":["Contrarian","Spider"]}},{"id":2,"names":["Apple","Ray",null],"type":"if"}] \ No newline at end of file diff --git a/variant/array_primitive.json b/variant/array_primitive.json deleted file mode 100644 index 6a57c5c..0000000 --- a/variant/array_primitive.json +++ /dev/null @@ -1 +0,0 @@ -[2,1,5,9] \ No newline at end of file diff --git a/variant/data_dictionary.json b/variant/data_dictionary.json new file 
mode 100644 index 0000000..16c71aa --- /dev/null +++ b/variant/data_dictionary.json @@ -0,0 +1,72 @@ +{ + "array_primitive": [ + 2, + 1, + 5, + 9 + ], + "primitive_binary": "AxM33q2+78r+", + "primitive_timestampntz": "2025-04-16 12:34:56.78", + "primitive_timestamp": "2025-04-16 12:34:56.78-04:00", + "primitive_float": 1234567940.0, + "primitive_int32": 123456, + "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)", + "primitive_decimal16": 1.2345678912345678e+16, + "primitive_int64": 12345678, + "primitive_int16": 1234, + "primitive_decimal8": 12345678.9, + "primitive_double": 1234567890.1234, + "object_primitive": { + "boolean_false_field": false, + "boolean_true_field": true, + "double_field": 1.23456789, + "int_field": 1, + "null_field": null, + "string_field": "Apache Parquet", + "timestamp_field": "2025-04-16T12:34:56.78" + }, + "primitive_string": "This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as \ud83d\udc22, \ud83d\udc96, \u2665\ufe0f, \ud83c\udfa3 and \ud83e\udd26!!", + "primitive_boolean_true": true, + "primitive_date": "2025-04-16", + "primitive_int8": 42, + "object_empty": {}, + "primitive_boolean_false": false, + "primitive_decimal4": 12.34, + "array_empty": [], + "primitive_null": null, + "object_nested": { + "id": 1, + "observation": { + "location": "In the Volcano", + "time": "12:34:56", + "value": { + "humidity": 456, + "temperature": 123 + } + }, + "species": { + "name": "lava monster", + "population": 12345 + } + }, + "array_nested": [ + { + "id": 1, + "thing": { + "names": [ + "Contrarian", + "Spider" + ] + } + }, + { + "id": 2, + "names": [ + "Apple", + "Ray", + null + ], + "type": "if" + } + ] +} \ No newline at end of file diff --git a/variant/long_string.json b/variant/long_string.json deleted file mode 100644 index 1f9f755..0000000 --- a/variant/long_string.json +++ /dev/null @@ -1 +0,0 @@ -"This string is for sure and certainly longer than 
64 bytes and it also includes several non ascii characters such as ðŸ¢, 💖, ♥ï¸, 🎣 and 🤦!!" \ No newline at end of file diff --git a/variant/object_empty.json b/variant/object_empty.json deleted file mode 100644 index 9e26dfe..0000000 --- a/variant/object_empty.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/variant/object_nested.json b/variant/object_nested.json deleted file mode 100644 index 0e9768b..0000000 --- a/variant/object_nested.json +++ /dev/null @@ -1 +0,0 @@ -{"id":1,"observation":{"location":"In the Volcano","time":"12:34:56","value":{"humidity":456,"temperature":123}},"species":{"name":"lava monster","population":12345}} \ No newline at end of file diff --git a/variant/object_primitive.json b/variant/object_primitive.json deleted file mode 100644 index 117c0ed..0000000 --- a/variant/object_primitive.json +++ /dev/null @@ -1 +0,0 @@ -{"boolean_false_field":false,"boolean_true_field":true,"double_field":1.23456789,"int_field":1,"null_field":null,"string_field":"Apache Parquet","timestamp_field":"2025-04-16T12:34:56.78"} \ No newline at end of file diff --git a/variant/primitive_binary.json b/variant/primitive_binary.json deleted file mode 100644 index d5009d8..0000000 --- a/variant/primitive_binary.json +++ /dev/null @@ -1 +0,0 @@ -"AxM33q2+78r+" \ No newline at end of file diff --git a/variant/primitive_boolean_false.json b/variant/primitive_boolean_false.json deleted file mode 100644 index 02e4a84..0000000 --- a/variant/primitive_boolean_false.json +++ /dev/null @@ -1 +0,0 @@ -false \ No newline at end of file diff --git a/variant/primitive_boolean_true.json b/variant/primitive_boolean_true.json deleted file mode 100644 index f32a580..0000000 --- a/variant/primitive_boolean_true.json +++ /dev/null @@ -1 +0,0 @@ -true \ No newline at end of file diff --git a/variant/primitive_date.json b/variant/primitive_date.json deleted file mode 100644 index d1ed165..0000000 --- a/variant/primitive_date.json +++ /dev/null @@ -1 +0,0 @@ 
-"2025-04-16" \ No newline at end of file diff --git a/variant/primitive_decimal16.json b/variant/primitive_decimal16.json deleted file mode 100644 index 121c171..0000000 --- a/variant/primitive_decimal16.json +++ /dev/null @@ -1 +0,0 @@ -12345678912345678.9 \ No newline at end of file diff --git a/variant/primitive_decimal4.json b/variant/primitive_decimal4.json deleted file mode 100644 index 2e0c11f..0000000 --- a/variant/primitive_decimal4.json +++ /dev/null @@ -1 +0,0 @@ -12.34 \ No newline at end of file diff --git a/variant/primitive_decimal8.json b/variant/primitive_decimal8.json deleted file mode 100644 index 07d7211..0000000 --- a/variant/primitive_decimal8.json +++ /dev/null @@ -1 +0,0 @@ -12345678.9 \ No newline at end of file diff --git a/variant/primitive_double.json b/variant/primitive_double.json deleted file mode 100644 index 21bfabf..0000000 --- a/variant/primitive_double.json +++ /dev/null @@ -1 +0,0 @@ -1.2345678901234E9 \ No newline at end of file diff --git a/variant/primitive_float.json b/variant/primitive_float.json deleted file mode 100644 index 7beed6c..0000000 --- a/variant/primitive_float.json +++ /dev/null @@ -1 +0,0 @@ -1.23456794E9 \ No newline at end of file diff --git a/variant/primitive_int16.json b/variant/primitive_int16.json deleted file mode 100644 index 274c005..0000000 --- a/variant/primitive_int16.json +++ /dev/null @@ -1 +0,0 @@ -1234 \ No newline at end of file diff --git a/variant/primitive_int32.json b/variant/primitive_int32.json deleted file mode 100644 index 4632e06..0000000 --- a/variant/primitive_int32.json +++ /dev/null @@ -1 +0,0 @@ -123456 \ No newline at end of file diff --git a/variant/primitive_int64.json b/variant/primitive_int64.json deleted file mode 100644 index e9a9ea1..0000000 --- a/variant/primitive_int64.json +++ /dev/null @@ -1 +0,0 @@ -12345678 \ No newline at end of file diff --git a/variant/primitive_int8.json b/variant/primitive_int8.json deleted file mode 100644 index f70d7bb..0000000 --- 
a/variant/primitive_int8.json +++ /dev/null @@ -1 +0,0 @@ -42 \ No newline at end of file diff --git a/variant/primitive_null.json b/variant/primitive_null.json deleted file mode 100644 index e69de29..0000000 diff --git a/variant/primitive_string.json b/variant/primitive_string.json deleted file mode 100644 index 00bc922..0000000 --- a/variant/primitive_string.json +++ /dev/null @@ -1 +0,0 @@ -"This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as ðŸ¢, 💖, ♥ï¸, 🎣 and 🤦!!" \ No newline at end of file diff --git a/variant/primitive_timestamp.json b/variant/primitive_timestamp.json deleted file mode 100644 index fce2701..0000000 --- a/variant/primitive_timestamp.json +++ /dev/null @@ -1 +0,0 @@ -"2025-04-16 12:34:56.78-04:00" \ No newline at end of file diff --git a/variant/primitive_timestampntz.json b/variant/primitive_timestampntz.json deleted file mode 100644 index ff69f87..0000000 --- a/variant/primitive_timestampntz.json +++ /dev/null @@ -1 +0,0 @@ -"2025-04-16 12:34:56.78" \ No newline at end of file diff --git a/variant/regen.py b/variant/regen.py index adc80bf..bfa19bc 100644 --- a/variant/regen.py +++ b/variant/regen.py @@ -27,6 +27,7 @@ from pyspark.sql import SparkSession import pyarrow.parquet as pq import os +import json # Initialize Spark session and create variant data via SQL spark = SparkSession.builder \ @@ -124,9 +125,8 @@ mypath = 'spark-warehouse/output' parquet_files = [f for f in os.listdir(mypath) if f.endswith('.parquet')] -print("Parquet files:", parquet_files) - # extract the values from the parquet files +data_dictionary = {} for f in parquet_files: table = pq.read_table(os.path.join(mypath, f)) for row in range(len(table)): @@ -149,8 +149,16 @@ buffer = value.as_buffer() if buffer is not None: f.write(buffer) - with open(f"{name}.json", "wb") as f: - buffer = json_value.as_buffer() - if buffer is not None: - f.write(buffer) + + # Add the JSON 
representation to the data dictionary + name = name.as_py() + json_value = json_value.as_py() + + if json_value is not None: + data_dictionary[name] = json.loads(json_value) + else: + data_dictionary[name] = None + +with open(f"data_dictionary.json", "w") as f: + f.write(json.dumps(data_dictionary, indent=4)) diff --git a/variant/short_string.json b/variant/short_string.json deleted file mode 100644 index 47d8c95..0000000 --- a/variant/short_string.json +++ /dev/null @@ -1 +0,0 @@ -"Less than 64 bytes (â¤ï¸ with utf8)" \ No newline at end of file From 444ccfd67846c9bbc8b982e4f5c02d8cc5c562d1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 May 2025 08:28:02 -0400 Subject: [PATCH 3/7] Improve readme --- .gitignore | 2 -- README.md | 1 - variant/.gitignore | 3 --- variant/README.md | 12 ++++-------- 4 files changed, 4 insertions(+), 14 deletions(-) delete mode 100644 .gitignore delete mode 100644 variant/.gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 56e80f4..0000000 --- a/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -.idea -variant/derby.log \ No newline at end of file diff --git a/README.md b/README.md index fe74318..07fd928 100644 --- a/README.md +++ b/README.md @@ -22,5 +22,4 @@ - [bad_data](bad_data/README.md) - Reproducers for bad data files for testing - [variant](variant/README.md) - Sample [Variant] binary values - [Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md \ No newline at end of file diff --git a/variant/.gitignore b/variant/.gitignore deleted file mode 100644 index 1e03a04..0000000 --- a/variant/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -derby.log -spark-warehouse -metastore_db \ No newline at end of file diff --git a/variant/README.md b/variant/README.md index ce8fce8..71e4da8 100644 --- a/variant/README.md +++ b/variant/README.md @@ -19,13 +19,13 @@ # Variant Binary Encoding -This directory contains binary artifacts encoded using the Parquet [Variant] +This directory contains 
binary artifacts encoded using the Parquet [Variant] binary encoding. These files are **not** valid Parquet files, but rather -raw binary data. +raw binary data. ## Structure -* `data_dictionary.json` - contains the JSON representation of each of the examples +* `data_dictionary.json` - contains the JSON representation for each example Each example consists of 2 files: @@ -43,14 +43,10 @@ Each example consists of 2 files: 5. `array_primitive` -- Example of array with only primitive elements 6. `array_nested` -- Example of an with objects and other arrays in the elements - ## Regenerating these files The files were generated by running the [`regen.py`](regen.py) script that uses Apache Spark to -generate the files. - - - +generate the files. [Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md [primitive types listed in the spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-primitive-type-basic_type0 From 8c989a85d3e2fbb91cc7e3d5e1110cd2721498d8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 May 2025 08:54:47 -0400 Subject: [PATCH 4/7] Add null at top level of nested struct, improve comments --- variant/array_nested.value | Bin 73 -> 75 bytes variant/data_dictionary.json | 97 ++++++++++++++++++----------------- variant/regen.py | 26 ++++++---- 3 files changed, 65 insertions(+), 58 deletions(-) diff --git a/variant/array_nested.value b/variant/array_nested.value index 5985b7c5c8737ae5f396173895c31a00ae220ebf..e7bd59a764fc1021967d1b2844af333bb5a26b99 100644 GIT binary patch delta 17 YcmebDW@Bb%P*roC$g0Z7Fi}e$02g@z#{d8T delta 14 VcmebFWMyVzP<5HeraDna9sm)P0-yi@ diff --git a/variant/data_dictionary.json b/variant/data_dictionary.json index 16c71aa..f3f96b8 100644 --- a/variant/data_dictionary.json +++ b/variant/data_dictionary.json @@ -1,39 +1,33 @@ { + "array_empty": [], + "array_nested": [ + { + "id": 1, + "thing": { + "names": [ + "Contrarian", + "Spider" + ] + } + }, + null, + { + "id": 
2, + "names": [ + "Apple", + "Ray", + null + ], + "type": "if" + } + ], "array_primitive": [ 2, 1, 5, 9 ], - "primitive_binary": "AxM33q2+78r+", - "primitive_timestampntz": "2025-04-16 12:34:56.78", - "primitive_timestamp": "2025-04-16 12:34:56.78-04:00", - "primitive_float": 1234567940.0, - "primitive_int32": 123456, - "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)", - "primitive_decimal16": 1.2345678912345678e+16, - "primitive_int64": 12345678, - "primitive_int16": 1234, - "primitive_decimal8": 12345678.9, - "primitive_double": 1234567890.1234, - "object_primitive": { - "boolean_false_field": false, - "boolean_true_field": true, - "double_field": 1.23456789, - "int_field": 1, - "null_field": null, - "string_field": "Apache Parquet", - "timestamp_field": "2025-04-16T12:34:56.78" - }, - "primitive_string": "This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as \ud83d\udc22, \ud83d\udc96, \u2665\ufe0f, \ud83c\udfa3 and \ud83e\udd26!!", - "primitive_boolean_true": true, - "primitive_date": "2025-04-16", - "primitive_int8": 42, "object_empty": {}, - "primitive_boolean_false": false, - "primitive_decimal4": 12.34, - "array_empty": [], - "primitive_null": null, "object_nested": { "id": 1, "observation": { @@ -49,24 +43,31 @@ "population": 12345 } }, - "array_nested": [ - { - "id": 1, - "thing": { - "names": [ - "Contrarian", - "Spider" - ] - } - }, - { - "id": 2, - "names": [ - "Apple", - "Ray", - null - ], - "type": "if" - } - ] + "object_primitive": { + "boolean_false_field": false, + "boolean_true_field": true, + "double_field": 1.23456789, + "int_field": 1, + "null_field": null, + "string_field": "Apache Parquet", + "timestamp_field": "2025-04-16T12:34:56.78" + }, + "primitive_binary": "AxM33q2+78r+", + "primitive_boolean_false": false, + "primitive_boolean_true": true, + "primitive_date": "2025-04-16", + "primitive_decimal16": 1.2345678912345678e+16, + 
"primitive_decimal4": 12.34, + "primitive_decimal8": 12345678.9, + "primitive_double": 1234567890.1234, + "primitive_float": 1234567940.0, + "primitive_int16": 1234, + "primitive_int32": 123456, + "primitive_int64": 12345678, + "primitive_int8": 42, + "primitive_null": null, + "primitive_string": "This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as \ud83d\udc22, \ud83d\udc96, \u2665\ufe0f, \ud83c\udfa3 and \ud83e\udd26!!", + "primitive_timestamp": "2025-04-16 12:34:56.78-04:00", + "primitive_timestampntz": "2025-04-16 12:34:56.78", + "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)" } \ No newline at end of file diff --git a/variant/regen.py b/variant/regen.py index bfa19bc..4ece404 100644 --- a/variant/regen.py +++ b/variant/regen.py @@ -56,7 +56,7 @@ -- https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types -- -- Spark Types: https://spark.apache.org/docs/latest/sql-ref-datatypes.html --- Note: use explicit typecasts as spark returns an error for implicit casts +-- Note: must use explicit typecasts as Ppark returns an error for implicit casts INSERT INTO T VALUES ('primitive_null', NULL); INSERT INTO T VALUES ('primitive_boolean_true', true::Variant); INSERT INTO T VALUES ('primitive_boolean_false', false::Variant); @@ -74,7 +74,8 @@ INSERT INTO T VALUES ('primitive_float', 1234567890.1234::Float::Variant); INSERT INTO T VALUES ('primitive_binary', X'31337deadbeefcafe'::Variant); INSERT INTO T VALUES ('primitive_string', 'This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as ðŸ¢, 💖, ♥ï¸, 🎣 and 🤦!!'::Variant); --- It is not clear how to create these types using Spark SQL + +-- TODO is not clear how to create the following types using Spark SQL -- TODO TimeNTZ (Type ID 17) -- TODO 'timestamp with timezone' (Type ID 18) -- TODO 'timestamp with time zone' 
(Type ID 19) @@ -89,13 +90,13 @@ -- Object (basic_type=2) ------------------------------- -- Use parse_json to create Variant, as spark does not seem to support casting structs --> Variant. --- TODO create example variant objects with fields that have more specific types (like timestamp, date, etc) --- cannot cast "STRUCT<...>" to "VARIANT"" --- INSERT INTO T VALUES ('object_primitive', struct(1234.56::Double as double_field, true as boolean_true_field, false as boolean_false_field, '2025-04-16T12:34:56.78'::Timestamp as timestamp_field, 'Apache Parquet' as string_field, null as null_field)::Variant); INSERT INTO T VALUES ('object_empty', parse_json('{}')::Variant); INSERT INTO T VALUES ('object_primitive', parse_json('{"int_field" : 1, "double_field": 1.23456789, "boolean_true_field": true, "boolean_false_field": false, "string_field": "Apache Parquet", "null_field": null, "timestamp_field": "2025-04-16T12:34:56.78"}')::Variant); INSERT INTO T VALUES ('object_nested', parse_json('{ "id" : 1, "species" : { "name": "lava monster", "population": 12345}, "observation" : { "time": "12:34:56", "location": "In the Volcano", "value" : { "temperature": 123, "humidity": 456 } } }')::Variant); +-- TODO create example variant objects with fields that non-json types (like timestamp, date, etc) +-- Casting from "STRUCT<...>" to "VARIANT"" is not yet supported +-- INSERT INTO T VALUES ('object_primitive', struct(1234.56::Double as double_field, true as boolean_true_field, false as boolean_false_field, '2025-04-16T12:34:56.78'::Timestamp as timestamp_field, 'Apache Parquet' as string_field, null as null_field)::Variant); --TODO objects with more than 2**8 distinct fields (that require using more than one byte for field offset) --TODO objects with more than 2**16 distinct fields (that require using more than 2 bytes for field offset) --TODO objects with more than 2**24 distinct fields (that require using more than 3 bytes for field offset) @@ -105,13 +106,14 @@ 
------------------------------- INSERT INTO T VALUES ('array_empty', parse_json('[]')::Variant); INSERT INTO T VALUES ('array_primitive', parse_json('[2, 1, 5, 9]')::Variant); -INSERT INTO T VALUES ('array_nested', parse_json('[ { "id": 1, "thing": { "names": ["Contrarian", "Spider"] } }, { "id": 2, "type": "if", "names": ["Apple", "Ray", null] } ]')::Variant); +INSERT INTO T VALUES ('array_nested', parse_json('[ { "id": 1, "thing": { "names": ["Contrarian", "Spider"] } }, null, { "id": 2, "type": "if", "names": ["Apple", "Ray", null] } ]')::Variant); -- TODO arrays with more than 2**8 distinct elements (that require using more than one byte for count) --- TODO arrays where the total length of all values is greater than 2**8, 2**16, and 2**24 bytes (requires using more than one byte for the offsets) +-- TODO arrays where the total length of all values is greater than 2**8, 2**16, and 2**24 bytes (that require using more than one byte for the offsets) - --- Copy the output to a new table that also has the JSON representation of the variant column +------------------------------- +-- Output the value to a new table that also has the JSON representation of the variant column +------------------------------- DROP TABLE IF EXISTS output; CREATE TABLE output AS SELECT name, variant_col, to_json(variant_col) as json_col FROM T; """ @@ -160,5 +162,9 @@ data_dictionary[name] = None with open(f"data_dictionary.json", "w") as f: - f.write(json.dumps(data_dictionary, indent=4)) + f.write(json.dumps(data_dictionary, sort_keys = True, indent=4)) +# Note: It is possible to write the output to a single parquet file, using a command +# such as: +# spark.sql("SELECT * FROM output").repartition(1).write.parquet('variant.parquet') +# At the time of writing, this file does not have the logical type annotation for VARIANT \ No newline at end of file From 56695a4195f7638e8e98856d28cd95b79ecf9e4d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 May 2025 08:59:38 -0400 Subject: 
[PATCH 5/7] Use different value for embedded field for clarity --- variant/data_dictionary.json | 2 +- variant/object_nested.value | Bin 79 -> 79 bytes variant/regen.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/variant/data_dictionary.json b/variant/data_dictionary.json index f3f96b8..02361b7 100644 --- a/variant/data_dictionary.json +++ b/variant/data_dictionary.json @@ -40,7 +40,7 @@ }, "species": { "name": "lava monster", - "population": 12345 + "population": 6789 } }, "object_primitive": { diff --git a/variant/object_nested.value b/variant/object_nested.value index f6385a6fb6d0f614505c0ffe4b83a6568561b23d..7d05f4b17b1681e34914eb2671eb6cdea4c4557e 100644 GIT binary patch delta 10 RcmebGpP<0hDm76_7XT1=0?+^e delta 10 RcmebGpP;~GX)sYq7XS}b0-68- diff --git a/variant/regen.py b/variant/regen.py index 4ece404..2d39b4f 100644 --- a/variant/regen.py +++ b/variant/regen.py @@ -92,7 +92,7 @@ -- Use parse_json to create Variant, as spark does not seem to support casting structs --> Variant. 
INSERT INTO T VALUES ('object_empty', parse_json('{}')::Variant); INSERT INTO T VALUES ('object_primitive', parse_json('{"int_field" : 1, "double_field": 1.23456789, "boolean_true_field": true, "boolean_false_field": false, "string_field": "Apache Parquet", "null_field": null, "timestamp_field": "2025-04-16T12:34:56.78"}')::Variant); -INSERT INTO T VALUES ('object_nested', parse_json('{ "id" : 1, "species" : { "name": "lava monster", "population": 12345}, "observation" : { "time": "12:34:56", "location": "In the Volcano", "value" : { "temperature": 123, "humidity": 456 } } }')::Variant); +INSERT INTO T VALUES ('object_nested', parse_json('{ "id" : 1, "species" : { "name": "lava monster", "population": 6789}, "observation" : { "time": "12:34:56", "location": "In the Volcano", "value" : { "temperature": 123, "humidity": 456 } } }')::Variant); -- TODO create example variant objects with fields that non-json types (like timestamp, date, etc) -- Casting from "STRUCT<...>" to "VARIANT"" is not yet supported From 61fc409c2734076c69a01013a769ef38c1ff80e9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 May 2025 14:15:36 -0400 Subject: [PATCH 6/7] Add ticket links --- variant/regen.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/variant/regen.py b/variant/regen.py index 2d39b4f..0fc578b 100644 --- a/variant/regen.py +++ b/variant/regen.py @@ -75,7 +75,8 @@ INSERT INTO T VALUES ('primitive_binary', X'31337deadbeefcafe'::Variant); INSERT INTO T VALUES ('primitive_string', 'This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as ðŸ¢, 💖, ♥ï¸, 🎣 and 🤦!!'::Variant); --- TODO is not clear how to create the following types using Spark SQL +-- https://github.com/apache/parquet-testing/issues/79 +-- is not clear how to create the following types using Spark SQL -- TODO TimeNTZ (Type ID 17) -- TODO 'timestamp with timezone' (Type ID 18) -- TODO 'timestamp with time zone' 
(Type ID 19) @@ -94,6 +95,7 @@ INSERT INTO T VALUES ('object_primitive', parse_json('{"int_field" : 1, "double_field": 1.23456789, "boolean_true_field": true, "boolean_false_field": false, "string_field": "Apache Parquet", "null_field": null, "timestamp_field": "2025-04-16T12:34:56.78"}')::Variant); INSERT INTO T VALUES ('object_nested', parse_json('{ "id" : 1, "species" : { "name": "lava monster", "population": 6789}, "observation" : { "time": "12:34:56", "location": "In the Volcano", "value" : { "temperature": 123, "humidity": 456 } } }')::Variant); +-- https://github.com/apache/parquet-testing/issues/77 -- TODO create example variant objects with fields that non-json types (like timestamp, date, etc) -- Casting from "STRUCT<...>" to "VARIANT"" is not yet supported -- INSERT INTO T VALUES ('object_primitive', struct(1234.56::Double as double_field, true as boolean_true_field, false as boolean_false_field, '2025-04-16T12:34:56.78'::Timestamp as timestamp_field, 'Apache Parquet' as string_field, null as null_field)::Variant); @@ -108,6 +110,7 @@ INSERT INTO T VALUES ('array_primitive', parse_json('[2, 1, 5, 9]')::Variant); INSERT INTO T VALUES ('array_nested', parse_json('[ { "id": 1, "thing": { "names": ["Contrarian", "Spider"] } }, null, { "id": 2, "type": "if", "names": ["Apple", "Ray", null] } ]')::Variant); +-- https://github.com/apache/parquet-testing/issues/78 -- TODO arrays with more than 2**8 distinct elements (that require using more than one byte for count) -- TODO arrays where the total length of all values is greater than 2**8, 2**16, and 2**24 bytes (that require using more than one byte for the offsets) From 10ca28971c1525d30e87cd4a7e5e5f9b2ec93696 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 May 2025 17:06:38 -0400 Subject: [PATCH 7/7] Apply suggestions from code review Co-authored-by: Russell Spitzer --- variant/regen.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/variant/regen.py b/variant/regen.py index 
0fc578b..ae9cb28 100644 --- a/variant/regen.py +++ b/variant/regen.py @@ -56,7 +56,7 @@ -- https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types -- -- Spark Types: https://spark.apache.org/docs/latest/sql-ref-datatypes.html --- Note: must use explicit typecasts as Ppark returns an error for implicit casts +-- Note: must use explicit typecasts as Spark returns an error for implicit casts INSERT INTO T VALUES ('primitive_null', NULL); INSERT INTO T VALUES ('primitive_boolean_true', true::Variant); INSERT INTO T VALUES ('primitive_boolean_false', false::Variant); @@ -78,8 +78,8 @@ -- https://github.com/apache/parquet-testing/issues/79 -- is not clear how to create the following types using Spark SQL -- TODO TimeNTZ (Type ID 17) --- TODO 'timestamp with timezone' (Type ID 18) --- TODO 'timestamp with time zone' (Type ID 19) +-- TODO 'timestamp with timezone (NANOS)' (Type ID 18) +-- TODO 'timestamp with time zone (NANOS)' (Type ID 19) -- TODO 'UUID' (Type ID 20) -------------------------------