Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions rust/sedona-functions/src/executor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,30 @@ impl GeometryFactory for WkbGeometryFactory {
}
}

/// A [GeometryFactory] whose geometry type are raw WKB bytes
///
/// Using this geometry factory iterates over items as references to the raw underlying
/// bytes, which is useful for writing optimized kernels that do not need the full buffer to
/// be validated and/or parsed.
#[derive(Default)]
pub struct WkbBytesFactory {}

impl GeometryFactory for WkbBytesFactory {
type Geom<'a> = &'a [u8];

fn try_from_wkb<'a>(&self, wkb_bytes: &'a [u8]) -> Result<Self::Geom<'a>> {
Ok(wkb_bytes)
}
}

/// Alias for an executor that iterates over geometries in their raw [Wkb] bytes.
///
/// This [GenericExecutor] implementation provides more optimization opportunities,
/// but it requires additional manual processing of the raw [Wkb] bytes compared to
/// the [WkbExecutor].
pub(crate) type WkbBytesExecutor<'a, 'b> =
GenericExecutor<'a, 'b, WkbBytesFactory, WkbBytesFactory>;

/// Trait for iterating over a container type as geometry scalars
///
/// Currently the only scalar type supported is [Wkb]; however, for future
Expand Down
61 changes: 36 additions & 25 deletions rust/sedona-functions/src/st_geometrytype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,16 @@
// under the License.
use std::sync::Arc;

use crate::executor::WkbExecutor;
use crate::executor::WkbBytesExecutor;
use arrow_array::builder::StringBuilder;
use arrow_schema::DataType;
use datafusion_common::error::Result;
use datafusion_expr::{
scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility,
};
use geo_traits::GeometryTrait;
use sedona_common::sedona_internal_err;
use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF};
use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher};
use wkb::reader::Wkb;

pub fn st_geometry_type_udf() -> SedonaScalarUDF {
SedonaScalarUDF::new(
Expand Down Expand Up @@ -67,16 +65,16 @@ impl SedonaScalarKernel for STGeometryType {
arg_types: &[SedonaType],
args: &[ColumnarValue],
) -> Result<ColumnarValue> {
let executor = WkbExecutor::new(arg_types, args);
let min_output_size = "POINT".len() * executor.num_iterations();
let executor = WkbBytesExecutor::new(arg_types, args);
let min_output_size = "ST_POINT".len() * executor.num_iterations();
let mut builder = StringBuilder::with_capacity(executor.num_iterations(), min_output_size);

// We can do quite a lot better than this with some vectorized WKB processing,
// but for now we just do a slow iteration
executor.execute_wkb_void(|maybe_item| {
match maybe_item {
Some(item) => {
builder.append_option(invoke_scalar(&item)?);
// Iterate over raw WKB bytes for faster type inference
executor.execute_wkb_void(|maybe_bytes| {
match maybe_bytes {
Some(bytes) => {
let name = infer_geometry_type_name(bytes)?;
builder.append_value(name);
}
None => builder.append_null(),
}
Expand All @@ -87,20 +85,33 @@ impl SedonaScalarKernel for STGeometryType {
}
}

fn invoke_scalar(item: &Wkb) -> Result<Option<String>> {
match item.as_type() {
geo_traits::GeometryType::Point(_) => Ok(Some("ST_Point".to_string())),
geo_traits::GeometryType::LineString(_) => Ok(Some("ST_LineString".to_string())),
geo_traits::GeometryType::Polygon(_) => Ok(Some("ST_Polygon".to_string())),
geo_traits::GeometryType::MultiPoint(_) => Ok(Some("ST_MultiPoint".to_string())),
geo_traits::GeometryType::MultiLineString(_) => Ok(Some("ST_MultiLineString".to_string())),
geo_traits::GeometryType::MultiPolygon(_) => Ok(Some("ST_MultiPolygon".to_string())),
geo_traits::GeometryType::GeometryCollection(_) => {
Ok(Some("ST_GeometryCollection".to_string()))
}

// Other geometry types in geo that we should not get here: Rect, Triangle, Line
_ => sedona_internal_err!("unexpected geometry type"),
/// Fast-path inference of geometry type name from raw WKB bytes
/// An error will be thrown for invalid WKB bytes input
///
/// Spec: https://libgeos.org/specifications/wkb/
#[inline]
fn infer_geometry_type_name(buf: &[u8]) -> Result<&'static str> {
if buf.len() < 5 {
return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len());
}

let byte_order = buf[0];
let code = match byte_order {
0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]),
1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]),
other => return sedona_internal_err!("Unexpected byte order: {other}"),
};

// Only low 3 bits is for the base type, high bits include additional info
match code & 0x7 {
1 => Ok("ST_Point"),
2 => Ok("ST_LineString"),
3 => Ok("ST_Polygon"),
4 => Ok("ST_MultiPoint"),
5 => Ok("ST_MultiLineString"),
6 => Ok("ST_MultiPolygon"),
7 => Ok("ST_GeometryCollection"),
_ => sedona_internal_err!("WKB type code out of range. Got: {}", code),
}
}

Expand Down