Skip to content

Commit

Permalink
Merge pull request #8262 from rouault/fix_8227
Browse files Browse the repository at this point in the history
Arrow/Parquet: support/reading nested list/map datatypes as JSON (fixes #8227)
  • Loading branch information
rouault authored Aug 24, 2023
2 parents fe91cfa + 16ea5ec commit bc2e42f
Show file tree
Hide file tree
Showing 9 changed files with 880 additions and 133 deletions.
2 changes: 1 addition & 1 deletion apps/ogrinfo_lib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1416,7 +1416,7 @@ static void ReportOnLayer(CPLString &osRet, CPLJSONObject oLayer,
}
}
else
oGeometries.AddNull("geometry");
oGeometries.AddNull();
}
}
}
Expand Down
49 changes: 47 additions & 2 deletions autotest/cpp/test_cpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2585,6 +2585,50 @@ TEST_F(test_cpl, CPLJSONDocument)
ASSERT_TRUE(!oDocument.Save("/i_do/not/exist"));
CPLPopErrorHandler();
}
{
CPLJSONObject oObj(nullptr);
EXPECT_EQ(oObj.GetType(), CPLJSONObject::Type::Null);
}
{
CPLJSONObject oObj(true);
EXPECT_EQ(oObj.GetType(), CPLJSONObject::Type::Boolean);
EXPECT_EQ(oObj.ToBool(), true);
}
{
CPLJSONObject oObj(1);
EXPECT_EQ(oObj.GetType(), CPLJSONObject::Type::Integer);
EXPECT_EQ(oObj.ToInteger(), 1);
}
{
CPLJSONObject oObj(static_cast<int64_t>(123) * 1024 * 1024 * 1024);
EXPECT_EQ(oObj.GetType(), CPLJSONObject::Type::Long);
EXPECT_EQ(oObj.ToLong(),
static_cast<int64_t>(123) * 1024 * 1024 * 1024);
}
{
CPLJSONObject oObj(static_cast<uint64_t>(123) * 1024 * 1024 * 1024);
// Might be a string with older libjson versions
if (oObj.GetType() == CPLJSONObject::Type::Long)
{
EXPECT_EQ(oObj.ToLong(),
static_cast<int64_t>(123) * 1024 * 1024 * 1024);
}
}
{
CPLJSONObject oObj(1.5);
EXPECT_EQ(oObj.GetType(), CPLJSONObject::Type::Double);
EXPECT_EQ(oObj.ToDouble(), 1.5);
}
{
CPLJSONObject oObj("ab");
EXPECT_EQ(oObj.GetType(), CPLJSONObject::Type::String);
EXPECT_STREQ(oObj.ToString().c_str(), "ab");
}
{
CPLJSONObject oObj(std::string("ab"));
EXPECT_EQ(oObj.GetType(), CPLJSONObject::Type::String);
EXPECT_STREQ(oObj.ToString().c_str(), "ab");
}
{
CPLJSONObject oObj;
oObj.Add("string", std::string("my_string"));
Expand Down Expand Up @@ -2659,7 +2703,8 @@ TEST_F(test_cpl, CPLJSONDocument)
oArray.Add(1);
oArray.Add(GINT64_MAX);
oArray.Add(true);
ASSERT_EQ(oArray.Size(), 7);
oArray.AddNull();
ASSERT_EQ(oArray.Size(), 8);

int nCount = 0;
for (const auto &obj : oArray)
Expand All @@ -2668,7 +2713,7 @@ TEST_F(test_cpl, CPLJSONDocument)
oArray[nCount].GetInternalHandle());
nCount++;
}
ASSERT_EQ(nCount, 7);
ASSERT_EQ(nCount, 8);
}
{
CPLJSONDocument oDocument;
Expand Down
250 changes: 250 additions & 0 deletions autotest/generate_parquet_test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,257 @@ def generate_parquet_wkt_with_dict():
)


def generate_nested_types():
    """Write the ``nested_types.parquet`` test file made of nested columns.

    Every column holds 5 rows. Columns are created in this order:

    * ``map_list_<T>``: map<string, list<T>>, for bool then each numeric type
    * ``map_map_<T>``: map<string, map<string, T>>, for bool, each numeric
      type, then string
    * ``list_list_string``: list<list<string>>
    * ``list_map_string``: list<map<string, string>>

    The file is written to ``ogr/data/parquet/nested_types.parquet`` relative
    to the directory containing this script.
    """
    import pathlib

    import pyarrow as pa
    import pyarrow.parquet as pq

    # (column name suffix, Arrow value type) for every numeric flavour. The
    # order of this list drives the order of the columns in the output file.
    numeric_types = [
        ("uint8", pa.uint8()),
        ("int8", pa.int8()),
        ("uint16", pa.uint16()),
        ("int16", pa.int16()),
        ("uint32", pa.uint32()),
        ("int32", pa.int32()),
        ("uint64", pa.uint64()),
        ("int64", pa.int64()),
        ("float32", pa.float32()),
        ("float64", pa.float64()),
    ]

    # Column name -> Arrow array. A dict keeps insertion order, so the table
    # columns come out in the order they are added below.
    columns = {}

    # --- map<string, list<T>> ---
    # The bool variant has its own data: its last row maps a key to a null
    # list, whereas the numeric variants end with an empty map.
    columns["map_list_bool"] = pa.array(
        [
            [("x", [True]), ("y", [False, True])],
            [("z", [])],
            None,
            [("w", [True, False])],
            [("null", None)],
        ],
        type=pa.map_(pa.string(), pa.list_(pa.bool_())),
    )

    for suffix, value_type in numeric_types:
        columns["map_list_" + suffix] = pa.array(
            [[("x", [2]), ("y", [3, 4])], [("z", [])], None, [("w", [5, 6])], []],
            type=pa.map_(pa.string(), pa.list_(value_type)),
        )

    # --- map<string, map<string, T>> ---
    columns["map_map_bool"] = pa.array(
        [
            [("a", [("b", True), ("c", None), ("d", None)]), ("e", None)],
            None,
            [("f", [("g", False)])],
            None,
            None,
        ],
        type=pa.map_(pa.string(), pa.map_(pa.string(), pa.bool_())),
    )

    for suffix, value_type in numeric_types:
        columns["map_map_" + suffix] = pa.array(
            [
                [("a", [("b", 1), ("c", None), ("d", 2)]), ("e", None)],
                None,
                [("f", [("g", 3)])],
                None,
                None,
            ],
            type=pa.map_(pa.string(), pa.map_(pa.string(), value_type)),
        )

    columns["map_map_string"] = pa.array(
        [
            [("a", [("b", "c"), ("d", None)]), ("e", None)],
            None,
            [("f", [("g", "h")])],
            None,
            None,
        ],
        type=pa.map_(pa.string(), pa.map_(pa.string(), pa.string())),
    )

    # --- list<list<string>> and list<map<string, string>> ---
    columns["list_list_string"] = pa.array(
        [[["a"], None, ["b", None, "cd"]], None, [["efg"]], [], []],
        type=pa.list_(pa.list_(pa.string())),
    )

    columns["list_map_string"] = pa.array(
        [[[("a", "b"), ("c", "d")], [("e", "f")]], None, [None], [], []],
        type=pa.list_(pa.map_(pa.string(), pa.string())),
    )

    names = list(columns)
    table = pa.table([columns[name] for name in names], names=names)

    HERE = pathlib.Path(__file__).parent
    pq.write_table(
        table,
        HERE / "ogr/data/parquet/nested_types.parquet",
        compression="NONE",
        # NOTE(review): presumably chosen so that the 5 rows span more than
        # one row group -- confirm against the tests reading this file.
        row_group_size=3,
    )


# When executed as a script, run each of the Parquet test file generators
# below, in this order.
if __name__ == "__main__":
    generate_test_parquet()
    generate_all_geoms_parquet()
    generate_parquet_wkt_with_dict()
    generate_nested_types()
Binary file added autotest/ogr/data/parquet/nested_types.parquet
Binary file not shown.
Loading

0 comments on commit bc2e42f

Please sign in to comment.