From 829002bfa4c6100eedef68562e3b85a4fd0d5d0a Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 11 May 2024 16:41:38 +0800 Subject: [PATCH 1/2] WIP: Add geometry type --- specification/ORCv2.md | 89 ++++++++++++++++++++++++ src/main/proto/orc/proto/orc_proto.proto | 59 ++++++++++++++++ 2 files changed, 148 insertions(+) diff --git a/specification/ORCv2.md b/specification/ORCv2.md index 3411485..14721c7 100644 --- a/specification/ORCv2.md +++ b/specification/ORCv2.md @@ -261,6 +261,7 @@ message Type { VARCHAR = 16; CHAR = 17; TIMESTAMP_INSTANT = 18; + GEOMETRY = 19; } // the kind of this type required Kind kind = 1; @@ -273,6 +274,61 @@ message Type { // the precision and scale for decimal optional uint32 precision = 5; optional uint32 scale = 6; + repeated StringPair attributes = 7; + // the attributes associated with the geometry type + optional GeometryType geometry = 8; +} +``` + +#### Geometry Type + +Geometry type requires additional information as described in the GeometryType +message below. These attributes limit the scope of geospatial features that +we can support for now. For example, only 2D geometries and OGC:CRS84 are +supported. + +``` +message GeometryType { + // A geometry can be any of the following geospatial subtype list, + // which is taken from the OGC (Open Geospatial Consortium) + // SFA (Simple Feature Access) Part 1- Common Architecture. + // If subtype is set, all values in the column must be of the same subtype; + // otherwise, the column may contain value of any subtype. + enum SubType { + POINT = 0; + LINESTRING = 1; + POLYGON = 2; + MULTIPOINT = 3; + MULTILINESTRING = 4; + MULTIPOLYGON = 5; + GEOMETRYCOLLECTION = 6; + } + optional SubType subtype = 1; + + // The dimension of the geometry. + // For now only 2D geometry is supported and the value must be 2 if set. + optional int32 dimension = 2; + + // Coordinate Reference System, i.e. mapping of how coordinates refer to + // precise locations on earth. 
+ // For now only OGC:CRS84 is supported. + optional string crs = 3; + + // Interpretation for edges, i.e. whether the edge between points + // represent a straight cartesian line or the shortest line on the sphere + enum Edges { + PLANAR = 0; + // SPHERICAL = 1; + } + optional Edges edges = 4; + + enum GeospatialEncoding { + // The geometry is stored as a Well-Known Binary (WKB) binary data. + // This is a well-known and popular binary representation regulated by + // the Open Geospatial Consortium (OGC). + WKB = 0; + } + optional GeospatialEncoding encoding = 5; } ``` @@ -301,6 +357,9 @@ message ColumnStatistics { optional BinaryStatistics binaryStatistics = 8; optional TimestampStatistics timestampStatistics = 9; optional bool hasNull = 10; + optional uint64 bytes_on_disk = 11; + optional CollectionStatistics collection_statistics = 12; + optional GeometryStatistics geometry_statistics = 13; } ``` @@ -395,6 +454,21 @@ message BinaryStatistics { } ``` +Geometry columns store coordinates of the bounding box built from all values. +For example, all 2D geometries are regarded as a collection of coordinate +(x, y). POINT has one coordinate, LINESTRING has two coordinates, and POLYGON +might have three or more coordinates. A bounding box is the combination of +x_min, x_max, y_min, and y_max of all coordinates from all geometry values. + +``` +message GeometryStatistics { + optional double min_x = 1; + optional double max_x = 2; + optional double min_y = 3; + optional double max_y = 4; +} +``` + ### User Metadata The user can add arbitrary key/value pairs to an ORC file as it is @@ -1233,6 +1307,21 @@ Encoding | Stream Kind | Optional | Contents DIRECT | PRESENT | Yes | Boolean RLE | DIRECT | No | Byte RLE +## Geometry Columns + +Geometry data is encoded with a PRESENT stream, a DATA stream that records +the WKB-encoded geometry data as binary, and a LENGTH stream that records +the number of bytes per value.
+ +Encoding | Stream Kind | Optional | Contents +:------------ | :-------------- | :------- | :------- +DIRECT | PRESENT | Yes | Boolean RLE + | DATA | No | Binary contents + | LENGTH | No | Unsigned Integer RLE v1 +DIRECT_V2 | PRESENT | Yes | Boolean RLE + | DATA | No | Binary contents + | LENGTH | No | Unsigned Integer RLE v2 + # Indexes ## Row Group Index diff --git a/src/main/proto/orc/proto/orc_proto.proto b/src/main/proto/orc/proto/orc_proto.proto index 16c5523..c4cc4ac 100644 --- a/src/main/proto/orc/proto/orc_proto.proto +++ b/src/main/proto/orc/proto/orc_proto.proto @@ -84,6 +84,19 @@ message CollectionStatistics { optional uint64 total_children = 3; } +// Statistics for geometry, which are coordinates of bounding box built from +// geometry objects in the same column. For example, all 2D geometries are +// regarded as a collection of coordinate (x, y). POINT has one coordinate, +// LINESTRING has two coordinates, and POLYGON might have three or more +// coordinates. A bounding box is the combination of x_min, x_max, y_min, and +// y_max of all coordinates from all geometry values. +message GeometryStatistics { + optional double min_x = 1; + optional double max_x = 2; + optional double min_y = 3; + optional double max_y = 4; +} + message ColumnStatistics { optional uint64 number_of_values = 1; optional IntegerStatistics int_statistics = 2; @@ -97,6 +110,7 @@ message ColumnStatistics { optional bool has_null = 10; optional uint64 bytes_on_disk = 11; optional CollectionStatistics collection_statistics = 12; + optional GeometryStatistics geometry_statistics = 13; } message RowIndexEntry { @@ -195,6 +209,49 @@ message StringPair { optional string value = 2; } +message GeometryType { + // A geometry can be any of the following geospatial subtype list, + // which is taken from the OGC (Open Geospatial Consortium) + // SFA (Simple Feature Access) Part 1- Common Architecture. 
+ // If subtype is set, all values in the column must be of the same subtype; + // otherwise, the column may contain value of any subtype. + enum SubType { + POINT = 0; + LINESTRING = 1; + POLYGON = 2; + MULTIPOINT = 3; + MULTILINESTRING = 4; + MULTIPOLYGON = 5; + GEOMETRYCOLLECTION = 6; + } + optional SubType subtype = 1; + + // The dimension of the geometry. + // For now only 2D geometry is supported and the value must be 2 if set. + optional int32 dimension = 2; + + // Coordinate Reference System, i.e. mapping of how coordinates refer to + // precise locations on earth. + // For now only OGC:CRS84 is supported. + optional string crs = 3; + + // Interpretation for edges, i.e. whether the edge between points + // represent a straight cartesian line or the shortest line on the sphere + enum Edges { + PLANAR = 0; + // SPHERICAL = 1; + } + optional Edges edges = 4; + + enum GeospatialEncoding { + // The geometry is stored as a Well-Known Binary (WKB) binary data. + // This is a well-known and popular binary representation regulated by + // the Open Geospatial Consortium (OGC). 
+ WKB = 0; + } + optional GeospatialEncoding encoding = 5; +} + message Type { enum Kind { BOOLEAN = 0; @@ -216,6 +273,7 @@ message Type { VARCHAR = 16; CHAR = 17; TIMESTAMP_INSTANT = 18; + GEOMETRY = 19; } optional Kind kind = 1; repeated uint32 subtypes = 2 [packed=true]; @@ -224,6 +282,7 @@ message Type { optional uint32 precision = 5; optional uint32 scale = 6; repeated StringPair attributes = 7; + optional GeometryType geometry = 8; } message StripeInformation { From 47311c21315a3e79c7ff894f796ef7132a92ebab Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 22 Aug 2024 13:59:16 +0800 Subject: [PATCH 2/2] reflect latest change from parquet proposal --- specification/ORCv2.md | 176 +++++++++++++++++------ src/main/proto/orc/proto/orc_proto.proto | 170 ++++++++++++++++------ 2 files changed, 256 insertions(+), 90 deletions(-) diff --git a/specification/ORCv2.md b/specification/ORCv2.md index 14721c7..d3fbd31 100644 --- a/specification/ORCv2.md +++ b/specification/ORCv2.md @@ -284,51 +284,81 @@ message Type { Geometry type requires additional information as described in the GeometryType message below. These attributes limit the scope of geospatial features that -we can support for now. For example, only 2D geometries and OGC:CRS84 are -supported. +we can support for now. ``` message GeometryType { - // A geometry can be any of the following geospatial subtype list, - // which is taken from the OGC (Open Geospatial Consortium) - // SFA (Simple Feature Access) Part 1- Common Architecture. - // If subtype is set, all values in the column must be of the same subtype; - // otherwise, the column may contain value of any subtype. - enum SubType { - POINT = 0; - LINESTRING = 1; - POLYGON = 2; - MULTIPOINT = 3; - MULTILINESTRING = 4; - MULTIPOLYGON = 5; - GEOMETRYCOLLECTION = 6; + enum GeometryEncoding { + // Well-known binary (WKB) representations of geometries. + // + // To be clear, we follow the same rule of WKB and coordinate axis order + // from GeoParquet [1][2]. 
It is the ISO WKB supporting XY, XYZ, XYM, XYZM + // and the standard geometry types (Point, LineString, Polygon, MultiPoint, + // MultiLineString, MultiPolygon, and GeometryCollection). + // + // [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 + // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 + WKB = 0; } - optional SubType subtype = 1; - - // The dimension of the geometry. - // For now only 2D geometry is supported and the value must be 2 if set. - optional int32 dimension = 2; + required GeometryEncoding encoding = 1; - // Coordinate Reference System, i.e. mapping of how coordinates refer to - // precise locations on earth. - // For now only OGC:CRS84 is supported. - optional string crs = 3; - - // Interpretation for edges, i.e. whether the edge between points - // represent a straight cartesian line or the shortest line on the sphere + // Interpretation for edges of non-point geometry objects, i.e. whether the + // edge between points represent a straight cartesian line or the shortest + // line on the sphere. enum Edges { PLANAR = 0; - // SPHERICAL = 1; + SPHERICAL = 1; } - optional Edges edges = 4; + required Edges edges = 2; - enum GeospatialEncoding { - // The geometry is stored as a Well-Known Binary (WKB) binary data. - // This is a well-known and popular binary representation regulated by - // the Open Geospatial Consortium (OGC). - WKB = 0; - } - optional GeospatialEncoding encoding = 5; + // Coordinate Reference System, i.e. mapping of how coordinates refer to + // precise locations on earth. Writers are not required to set this field. + // Once crs is set, crs_encoding field below MUST be set together.
+ // For example, "OGC:CRS84" can be set in the form of PROJJSON as below: + // { + // "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", + // "type": "GeographicCRS", + // "name": "WGS 84 longitude-latitude", + // "datum": { + // "type": "GeodeticReferenceFrame", + // "name": "World Geodetic System 1984", + // "ellipsoid": { + // "name": "WGS 84", + // "semi_major_axis": 6378137, + // "inverse_flattening": 298.257223563 + // } + // }, + // "coordinate_system": { + // "subtype": "ellipsoidal", + // "axis": [ + // { + // "name": "Geodetic longitude", + // "abbreviation": "Lon", + // "direction": "east", + // "unit": "degree" + // }, + // { + // "name": "Geodetic latitude", + // "abbreviation": "Lat", + // "direction": "north", + // "unit": "degree" + // } + // ] + // }, + // "id": { + // "authority": "OGC", + // "code": "CRS84" + // } + // } + // + optional string crs = 3; + // Encoding used in the above crs field. It MUST be set if crs field is set. + // Currently the only allowed value is "PROJJSON". + optional string crs_encoding = 4; + + // Additional informative metadata about the geometry type. + // Recommended to write a JSON-encoded UTF-8 string. + optional string metadata = 5; } ``` @@ -454,18 +484,72 @@ message BinaryStatistics { } ``` -Geometry columns store coordinates of the bounding box built from all values. -For example, all 2D geometries are regarded as a collection of coordinate -(x, y). POINT has one coordinate, LINESTRING has two coordinates, and POLYGON -might have three or more coordinates. A bounding box is the combination of -x_min, x_max, y_min, and y_max of all coordinates from all geometry values. +Geometry columns store optional bounding boxes, coverings and list of +geometry type codes from all values. + +``` +// Bounding box of geometries in the representation of min/max value pair of +// coordinates from each axis. Values of Z and M are omitted for 2D geometries. 
+// Filter pushdown on geometries is only safe for planar spatial predicates +// but it is recommended that the writer always generates bounding boxes +// regardless of whether the geometries are planar or spherical. +message BoundingBox { + required double xmin = 1; + required double xmax = 2; + required double ymin = 3; + required double ymax = 4; + optional double zmin = 5; + optional double zmax = 6; + optional double mmin = 7; + optional double mmax = 8; +} + +// A custom binary-encoded polygon or multi-polygon to represent a covering of +// geometries. For example, it may be a bounding box or an envelope when a +// bounding box cannot be built (e.g., a geometry has spherical edges, or if +// an edge of geographic coordinates crosses the antimeridian). In addition, +// it can also be used to provide vendor-agnostic coverings like S2 or H3 grids. +message Covering { + // A type of covering. Currently accepted values: "WKB". + optional string kind = 1; + // A payload specific to the kind. Below are the supported values: + // - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely + // covers the contents. This will be interpreted according to the same CRS + // and edges defined by the logical type. + optional bytes value = 2; +} -``` message GeometryStatistics { - optional double min_x = 1; - optional double max_x = 2; - optional double min_y = 3; - optional double max_y = 4; + // The bounding box of geometries in the column. + optional BoundingBox bbox = 1; + // List of coverings of geometries in the column. + repeated Covering coverings = 2; + // The geometry types of all geometries, or an empty array if they are not + // known. This is borrowed from `geometry_types` column metadata of GeoParquet [1] + // except that values in the list are WKB (ISO variant) integer codes [2].
Table + // below shows the most common geometry types and their codes: + // + // | Type | XY | XYZ | XYM | XYZM | + // | :----------------- | :--- | :--- | :--- | :--: | + // | Point | 0001 | 1001 | 2001 | 3001 | + // | LineString | 0002 | 1002 | 2002 | 3002 | + // | Polygon | 0003 | 1003 | 2003 | 3003 | + // | MultiPoint | 0004 | 1004 | 2004 | 3004 | + // | MultiLineString | 0005 | 1005 | 2005 | 3005 | + // | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + // | GeometryCollection | 0007 | 1007 | 2007 | 3007 | + // + // In addition, the following rules are used: + // - A list of multiple values indicates that multiple geometry types are + // present (e.g. `[0003, 0006]`). + // - An empty array explicitly signals that the geometry types are not known. + // - The geometry types in the list must be unique (e.g. `[0001, 0001]` + // is not valid). + // + // Please refer to links below for more detail: + // [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 + // [2] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary + repeated int32 geometry_types = 3; } ``` diff --git a/src/main/proto/orc/proto/orc_proto.proto b/src/main/proto/orc/proto/orc_proto.proto index c4cc4ac..d57b5d8 100644 --- a/src/main/proto/orc/proto/orc_proto.proto +++ b/src/main/proto/orc/proto/orc_proto.proto @@ -84,17 +84,68 @@ message CollectionStatistics { optional uint64 total_children = 3; } -// Statistics for geometry, which are coordinates of bounding box built from -// geometry objects in the same column. For example, all 2D geometries are -// regarded as a collection of coordinate (x, y). POINT has one coordinate, -// LINESTRING has two coordinates, and POLYGON might have three or more -// coordinates. A bounding box is the combination of x_min, x_max, y_min, and -// y_max of all coordinates from all geometry values.
+// Bounding box of geometries in the representation of min/max value pair of +// coordinates from each axis. Values of Z and M are omitted for 2D geometries. +// Filter pushdown on geometries is only safe for planar spatial predicates +// but it is recommended that the writer always generates bounding boxes +// regardless of whether the geometries are planar or spherical. +message BoundingBox { + required double xmin = 1; + required double xmax = 2; + required double ymin = 3; + required double ymax = 4; + optional double zmin = 5; + optional double zmax = 6; + optional double mmin = 7; + optional double mmax = 8; +} + +// A custom binary-encoded polygon or multi-polygon to represent a covering of +// geometries. For example, it may be a bounding box or an envelope when a +// bounding box cannot be built (e.g., a geometry has spherical edges, or if +// an edge of geographic coordinates crosses the antimeridian). In addition, +// it can also be used to provide vendor-agnostic coverings like S2 or H3 grids. +message Covering { + // A type of covering. Currently accepted values: "WKB". + optional string kind = 1; + // A payload specific to the kind. Below are the supported values: + // - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely + // covers the contents. This will be interpreted according to the same CRS + // and edges defined by the logical type. + optional bytes value = 2; +} + message GeometryStatistics { - optional double min_x = 1; - optional double max_x = 2; - optional double min_y = 3; - optional double max_y = 4; + // The bounding box of geometries in the column. + optional BoundingBox bbox = 1; + // List of coverings of geometries in the column. + repeated Covering coverings = 2; + // The geometry types of all geometries, or an empty array if they are not + // known. This is borrowed from `geometry_types` column metadata of GeoParquet [1] + // except that values in the list are WKB (ISO variant) integer codes [2].
Table + // below shows the most common geometry types and their codes: + // + // | Type | XY | XYZ | XYM | XYZM | + // | :----------------- | :--- | :--- | :--- | :--: | + // | Point | 0001 | 1001 | 2001 | 3001 | + // | LineString | 0002 | 1002 | 2002 | 3002 | + // | Polygon | 0003 | 1003 | 2003 | 3003 | + // | MultiPoint | 0004 | 1004 | 2004 | 3004 | + // | MultiLineString | 0005 | 1005 | 2005 | 3005 | + // | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + // | GeometryCollection | 0007 | 1007 | 2007 | 3007 | + // + // In addition, the following rules are used: + // - A list of multiple values indicates that multiple geometry types are + // present (e.g. `[0003, 0006]`). + // - An empty array explicitly signals that the geometry types are not known. + // - The geometry types in the list must be unique (e.g. `[0001, 0001]` + // is not valid). + // + // Please refer to links below for more detail: + // [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 + // [2] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary + repeated int32 geometry_types = 3; } message ColumnStatistics { @@ -210,46 +261,77 @@ message StringPair { } message GeometryType { - // A geometry can be any of the following geospatial subtype list, - // which is taken from the OGC (Open Geospatial Consortium) - // SFA (Simple Feature Access) Part 1- Common Architecture. - // If subtype is set, all values in the column must be of the same subtype; - // otherwise, the column may contain value of any subtype. - enum SubType { - POINT = 0; - LINESTRING = 1; - POLYGON = 2; - MULTIPOINT = 3; - MULTILINESTRING = 4; - MULTIPOLYGON = 5; - GEOMETRYCOLLECTION = 6; + enum GeometryEncoding { + // Well-known binary (WKB) representations of geometries. + // + // To be clear, we follow the same rule of WKB and coordinate axis order + // from GeoParquet [1][2].
It is the ISO WKB supporting XY, XYZ, XYM, XYZM + // and the standard geometry types (Point, LineString, Polygon, MultiPoint, + // MultiLineString, MultiPolygon, and GeometryCollection). + // + // [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 + // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 + WKB = 0; } - optional SubType subtype = 1; + required GeometryEncoding encoding = 1; - // The dimension of the geometry. - // For now only 2D geometry is supported and the value must be 2 if set. - optional int32 dimension = 2; - - // Coordinate Reference System, i.e. mapping of how coordinates refer to - // precise locations on earth. - // For now only OGC:CRS84 is supported. - optional string crs = 3; - - // Interpretation for edges, i.e. whether the edge between points - // represent a straight cartesian line or the shortest line on the sphere + // Interpretation for edges of non-point geometry objects, i.e. whether the + // edge between points represent a straight cartesian line or the shortest + // line on the sphere. enum Edges { PLANAR = 0; - // SPHERICAL = 1; + SPHERICAL = 1; } - optional Edges edges = 4; + required Edges edges = 2; - enum GeospatialEncoding { - // The geometry is stored as a Well-Known Binary (WKB) binary data. - // This is a well-known and popular binary representation regulated by - // the Open Geospatial Consortium (OGC). - WKB = 0; - } - optional GeospatialEncoding encoding = 5; + // Coordinate Reference System, i.e. mapping of how coordinates refer to + // precise locations on earth. Writers are not required to set this field. + // Once crs is set, crs_encoding field below MUST be set together. 
+ // For example, "OGC:CRS84" can be set in the form of PROJJSON as below: + // { + // "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", + // "type": "GeographicCRS", + // "name": "WGS 84 longitude-latitude", + // "datum": { + // "type": "GeodeticReferenceFrame", + // "name": "World Geodetic System 1984", + // "ellipsoid": { + // "name": "WGS 84", + // "semi_major_axis": 6378137, + // "inverse_flattening": 298.257223563 + // } + // }, + // "coordinate_system": { + // "subtype": "ellipsoidal", + // "axis": [ + // { + // "name": "Geodetic longitude", + // "abbreviation": "Lon", + // "direction": "east", + // "unit": "degree" + // }, + // { + // "name": "Geodetic latitude", + // "abbreviation": "Lat", + // "direction": "north", + // "unit": "degree" + // } + // ] + // }, + // "id": { + // "authority": "OGC", + // "code": "CRS84" + // } + // } + // + optional string crs = 3; + // Encoding used in the above crs field. It MUST be set if crs field is set. + // Currently the only allowed value is "PROJJSON". + optional string crs_encoding = 4; + + // Additional informative metadata about the geometry type. + // Recommended to write a JSON-encoded UTF-8 string. + optional string metadata = 5; } message Type {