Skip to content

Commit 3177790

Browse files
committed
Validate types and geometries #26
1 parent 2473338 commit 3177790

File tree

4 files changed

+79
-3
lines changed

4 files changed

+79
-3
lines changed

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
1212
- Converter for EuroCrops France dataset
1313
- `fiboa create-geojson`: Show conversion progress
1414
- `fiboa jsonschema` and `fiboa validate`: Support `geometryTypes` for `geometry` data type in GeoJSON
15+
- `fiboa validate`: Basic validation for geometries in GeoParquet files
1516

1617
### Changed
1718

@@ -32,6 +33,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
3233
- `fiboa validate`:
3334
- Is more robust against invalid collections and doesn't abort if not needed
3435
- Check NULL values correctly in case of arrays
36+
- Validate geometry column metadata
3537
- `fiboa create-geojson`:
3638
- Handles GeoParquet bbox correctly
3739
- Converts numpy arrays

fiboa_cli/types.py

+28-3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import pyarrow.types as pat
33
import pandas as pd
44
import numpy as np
5+
from shapely.geometry.base import BaseGeometry
56

67
def is_enum(schema):
78
return isinstance(schema.get("enum"), list)
@@ -115,6 +116,7 @@ def get_pyarrow_type(schema):
115116
pa_subtype = get_pyarrow_type(schema.get("items", {}))
116117
return pa.list_(pa_subtype)
117118
elif dtype == "object":
119+
# todo: add patternProperties
118120
additonal_properties = schema.get("additionalProperties", False)
119121
if additonal_properties is True:
120122
raise Exception("Additional properties for objects are not supported")
@@ -154,7 +156,6 @@ def get_pyarrow_type_for_geopandas(dtype):
154156
elif dtype == "datetime64":
155157
return pa.timestamp("ms", tz="UTC")
156158
else:
157-
print(dtype)
158159
return None
159160

160161

@@ -174,10 +175,34 @@ def get_pyarrow_type_for_geopandas(dtype):
174175
"binary": pat.is_binary,
175176
"string": pat.is_string,
176177
"array": pat.is_list,
177-
"object": pat.is_map,
178+
"object": pat.is_struct,
178179
"date": pat.is_date32,
179180
"date-time": pat.is_timestamp,
180-
"geometry": pat.is_binary, # todo: check more?
181+
"geometry": pat.is_binary,
182+
"bounding-box": pat.is_struct
183+
}
184+
185+
186+
# maps data types to the Python types used for isinstance checks (None = not checked yet)
187+
PYTHON_TYPES = {
188+
"boolean": bool,
189+
"int8": int,
190+
"uint8": int,
191+
"int16": int,
192+
"uint16": int,
193+
"int32": int,
194+
"uint32": int,
195+
"int64": int,
196+
"uint64": int,
197+
"float": float,
198+
"double": float,
199+
"binary": None, # todo
200+
"string": str,
201+
"array": (list, np.ndarray),
202+
"object": dict,
203+
"date": None, # todo
204+
"date-time": None, # todo
205+
"geometry": BaseGeometry,
181206
"bounding-box": None # todo
182207
}
183208

fiboa_cli/validate.py

+22
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ def validate_parquet(file, config):
259259
if not pat.is_string(pq_field.key_type):
260260
log(f"{key}: Map keys must be strings", "error")
261261
valid = False
262+
elif dtype == "geometry":
263+
valid = validate_geometry_column(key, prop_schema, geo, valid)
262264

263265
# Validate data of the column
264266
if gdf is not None:
@@ -275,6 +277,26 @@ def validate_parquet(file, config):
275277
return valid
276278

277279

280+
def validate_geometry_column(key, prop_schema, geo, valid=True):
    """Validate the GeoParquet column metadata of a geometry column.

    Checks that ``key`` is listed in the ``columns`` section of the GeoParquet
    metadata and, if the schema restricts ``geometryTypes``, that the
    ``geometry_types`` declared in the metadata are the same set of types.

    Args:
        key: Name of the geometry column being validated.
        prop_schema: Schema of the property; ``geometryTypes`` is read from it.
        geo: GeoParquet metadata dict; its ``columns`` entry is inspected.
            ``None`` is treated like metadata without any columns.
        valid: Validation state so far; returned unchanged if no error is found.

    Returns:
        ``False`` if an error was logged, otherwise the incoming ``valid``.
    """
    columns = (geo or {}).get("columns", {})
    if key not in columns:
        log(f"{key}: Geometry column not found in GeoParquet metadata", "error")
        # Without column metadata there is nothing to compare against;
        # continuing would fail with a KeyError on columns[key].
        return False

    # Compare sorted copies: list.sort() would reorder the lists that live
    # inside the caller's schema and metadata objects.
    schema_geo_types = sorted(prop_schema.get("geometryTypes", []))
    if not schema_geo_types:
        # The schema does not restrict geometry types.
        return valid

    gp_geo_types = sorted(columns[key].get("geometry_types", []))
    if not gp_geo_types:
        log(f"{key}: No geometry types specified in GeoParquet metadata", "warning")

    if schema_geo_types != gp_geo_types:
        log(f"{key}: GeoParquet geometry types differ, is {gp_geo_types} but must be {schema_geo_types}", "error")
        valid = False

    return valid
299+
278300
# todo: use stac_validator instead of our own validation routine
279301
def validate_colletion_schema(obj):
280302
if "stac_version" in obj:

fiboa_cli/validate_data.py

+27
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
import re
22
import pandas as pd
3+
34
from urllib.parse import urlparse
5+
from shapely.geometry.base import BaseGeometry
6+
from shapely.validation import explain_validity
7+
8+
from .types import PYTHON_TYPES
49

510
REGEX_EMAIL = re.compile("[^@]+@[^@]+\.[^@]+")
611
REGEX_UUID = re.compile("^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}\Z")
@@ -12,6 +17,11 @@ def validate_column(data, rules):
1217
# Skip validation for NaN values or implement special handling if required
1318
continue
1419

20+
dtype = rules.get('type')
21+
python_type = PYTHON_TYPES.get(dtype)
22+
if python_type is not None and not isinstance(value, python_type):
23+
return [f"Value '{value}' is not of type {dtype}."]
24+
1525
if isinstance(value, str):
1626
issues = validate_string(value, rules)
1727
elif isinstance(value, (int, float)):
@@ -20,6 +30,8 @@ def validate_column(data, rules):
2030
issues = validate_array(value, rules)
2131
elif isinstance(value, dict):
2232
issues = validate_object(value, rules)
33+
elif isinstance(value, BaseGeometry):
34+
issues = validate_geometry(value, rules)
2335
else:
2436
continue
2537

@@ -28,6 +40,21 @@ def validate_column(data, rules):
2840

2941
return []
3042

43+
# Geometry validation
44+
def validate_geometry(value, rules):
    """Collect validation issues for a single geometry value.

    Args:
        value: Geometry object; its ``geom_type`` is compared against the
            allowed types and it is passed to ``explain_validity``.
        rules: Schema rules for the column; ``geometryTypes`` is read from it.

    Returns:
        A list of issue messages; empty if no issue was found.
    """
    problems = []

    # Only enforce the geometry type if the schema lists allowed types.
    permitted = rules.get("geometryTypes", [])
    type_restricted = len(permitted) > 0
    if type_restricted and value.geom_type not in permitted:
        allowed = ", ".join(permitted)
        problems.append(f"Geometry type '{value.geom_type}' is not one of the allowed types: {allowed}")

    # explain_validity reports 'Valid Geometry' when nothing is wrong.
    verdict = explain_validity(value)
    if verdict == 'Valid Geometry':
        return problems

    problems.append(f"Geometry {value} is not valid: {verdict}")
    return problems
57+
3158
# String validation
3259
def validate_string(value, rules):
3360
issues = []

0 commit comments

Comments
 (0)