Skip to content

Commit 8d7d778

Browse files
authored
perf: Faster st_geometrytype() function (#90)
1 parent bced6b3 commit 8d7d778

File tree

2 files changed

+60
-25
lines changed

2 files changed

+60
-25
lines changed

rust/sedona-functions/src/executor.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,30 @@ impl GeometryFactory for WkbGeometryFactory {
246246
}
247247
}
248248

249+
/// A [GeometryFactory] whose geometry type is a slice of raw WKB bytes
///
/// Using this geometry factory iterates over items as references to the raw underlying
/// bytes, which is useful for writing optimized kernels that do not need the full buffer to
/// be validated and/or parsed.
///
/// The factory carries no state, so it is cheap to copy and can be debug-printed.
#[derive(Clone, Copy, Debug, Default)]
pub struct WkbBytesFactory {}
256+
257+
impl GeometryFactory for WkbBytesFactory {
    /// Items are handed out as borrowed byte slices tied to the input's lifetime.
    type Geom<'a> = &'a [u8];

    /// Returns `wkb_bytes` unchanged.
    ///
    /// No parsing or validation happens here, so this never fails; interpreting the
    /// bytes is left entirely to the caller.
    fn try_from_wkb<'a>(&self, wkb_bytes: &'a [u8]) -> Result<Self::Geom<'a>> {
        Ok(wkb_bytes)
    }
}
264+
265+
/// Alias for an executor that iterates over geometries in their raw [Wkb] bytes.
266+
///
267+
/// This [GenericExecutor] implementation provides more optimization opportunities,
268+
/// but it requires additional manual processing of the raw [Wkb] bytes compared to
269+
/// the [WkbExecutor].
270+
pub(crate) type WkbBytesExecutor<'a, 'b> =
271+
GenericExecutor<'a, 'b, WkbBytesFactory, WkbBytesFactory>;
272+
249273
/// Trait for iterating over a container type as geometry scalars
250274
///
251275
/// Currently the only scalar type supported is [Wkb]; however, for future

rust/sedona-functions/src/st_geometrytype.rs

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,16 @@
1616
// under the License.
1717
use std::sync::Arc;
1818

19-
use crate::executor::WkbExecutor;
19+
use crate::executor::WkbBytesExecutor;
2020
use arrow_array::builder::StringBuilder;
2121
use arrow_schema::DataType;
2222
use datafusion_common::error::Result;
2323
use datafusion_expr::{
2424
scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility,
2525
};
26-
use geo_traits::GeometryTrait;
2726
use sedona_common::sedona_internal_err;
2827
use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF};
2928
use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher};
30-
use wkb::reader::Wkb;
3129

3230
pub fn st_geometry_type_udf() -> SedonaScalarUDF {
3331
SedonaScalarUDF::new(
@@ -67,16 +65,16 @@ impl SedonaScalarKernel for STGeometryType {
6765
arg_types: &[SedonaType],
6866
args: &[ColumnarValue],
6967
) -> Result<ColumnarValue> {
70-
let executor = WkbExecutor::new(arg_types, args);
71-
let min_output_size = "POINT".len() * executor.num_iterations();
68+
let executor = WkbBytesExecutor::new(arg_types, args);
69+
let min_output_size = "ST_POINT".len() * executor.num_iterations();
7270
let mut builder = StringBuilder::with_capacity(executor.num_iterations(), min_output_size);
7371

74-
// We can do quite a lot better than this with some vectorized WKB processing,
75-
// but for now we just do a slow iteration
76-
executor.execute_wkb_void(|maybe_item| {
77-
match maybe_item {
78-
Some(item) => {
79-
builder.append_option(invoke_scalar(&item)?);
72+
// Iterate over raw WKB bytes for faster type inference
73+
executor.execute_wkb_void(|maybe_bytes| {
74+
match maybe_bytes {
75+
Some(bytes) => {
76+
let name = infer_geometry_type_name(bytes)?;
77+
builder.append_value(name);
8078
}
8179
None => builder.append_null(),
8280
}
@@ -87,20 +85,33 @@ impl SedonaScalarKernel for STGeometryType {
8785
}
8886
}
8987

90-
fn invoke_scalar(item: &Wkb) -> Result<Option<String>> {
91-
match item.as_type() {
92-
geo_traits::GeometryType::Point(_) => Ok(Some("ST_Point".to_string())),
93-
geo_traits::GeometryType::LineString(_) => Ok(Some("ST_LineString".to_string())),
94-
geo_traits::GeometryType::Polygon(_) => Ok(Some("ST_Polygon".to_string())),
95-
geo_traits::GeometryType::MultiPoint(_) => Ok(Some("ST_MultiPoint".to_string())),
96-
geo_traits::GeometryType::MultiLineString(_) => Ok(Some("ST_MultiLineString".to_string())),
97-
geo_traits::GeometryType::MultiPolygon(_) => Ok(Some("ST_MultiPolygon".to_string())),
98-
geo_traits::GeometryType::GeometryCollection(_) => {
99-
Ok(Some("ST_GeometryCollection".to_string()))
100-
}
101-
102-
// Other geometry types in geo that we should not get here: Rect, Triangle, Line
103-
_ => sedona_internal_err!("unexpected geometry type"),
88+
/// Fast-path inference of geometry type name from raw WKB bytes
89+
/// An error will be thrown for invalid WKB bytes input
90+
///
91+
/// Spec: https://libgeos.org/specifications/wkb/
92+
#[inline]
93+
fn infer_geometry_type_name(buf: &[u8]) -> Result<&'static str> {
94+
if buf.len() < 5 {
95+
return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len());
96+
}
97+
98+
let byte_order = buf[0];
99+
let code = match byte_order {
100+
0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]),
101+
1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]),
102+
other => return sedona_internal_err!("Unexpected byte order: {other}"),
103+
};
104+
105+
// Only low 3 bits is for the base type, high bits include additional info
106+
match code & 0x7 {
107+
1 => Ok("ST_Point"),
108+
2 => Ok("ST_LineString"),
109+
3 => Ok("ST_Polygon"),
110+
4 => Ok("ST_MultiPoint"),
111+
5 => Ok("ST_MultiLineString"),
112+
6 => Ok("ST_MultiPolygon"),
113+
7 => Ok("ST_GeometryCollection"),
114+
_ => sedona_internal_err!("WKB type code out of range. Got: {}", code),
104115
}
105116
}
106117

0 commit comments

Comments
 (0)