
Commit 8846bf4

Improved data type conversion, enum and struct support #5

1 parent 26e2110 · commit 8846bf4

9 files changed · +227 −143 lines

CHANGELOG.md (+2 −2)

@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### Added
 
-- ...
+- Support for enums
 
 ### Changed
 
@@ -25,7 +25,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### Fixed
 
-- `fiboa create-geoparquet`: Handle column conversion more gracaefully
+- `fiboa create-geoparquet`: Handle column conversion more gracefully
 - `fiboa validate`: Don't fail collection test if something unexpected happened
 - `fiboa create-geojson`: Option `-f` doesn't need a value any longer
 

fiboa_cli/const.py (+1 −101)

@@ -1,104 +1,3 @@
-import pyarrow as pa
-import pyarrow.types as pat
-import pandas as pd
-
-# fiboa datatypes to geopandas datatypes
-# todo: check whether it's better to use nullable Ints (e.g. Int64 instead of int64)
-GP_TYPE_MAP = {
-    "boolean": "bool",
-    "int8": "int8", # todo add enum support - start
-    "uint8": "uint8",
-    "int16": "int16",
-    "uint16": "uint16",
-    "int32": "int32",
-    "uint32": "uint32",
-    "int64": "int64",
-    "uint64": "uint64", # todo add enum support - end
-    "float": "float32",
-    "double": "float64",
-    "binary": "bytearray", # todo: check
-    "string": "str", # todo: support enum
-    "array": None, # todo: object?
-    "object": "object",
-    "date": "datetime64[D]",
-    "date-time": lambda x: pd.to_datetime(x),
-    "geometry": None, # not a column, don't convert geometry
-    "bounding-box": None # todo
-}
-
-# fiboa datatypes to pyarrow datatypes
-PA_TYPE_MAP = {
-    "boolean": pa.bool_(),
-    "int8": pa.int8(), # todo add enum support - start
-    "uint8": pa.uint8(),
-    "int16": pa.int16(),
-    "uint16": pa.uint16(),
-    "int32": pa.int32(),
-    "uint32": pa.uint32(),
-    "int64": pa.int64(),
-    "uint64": pa.uint64(), # todo add enum support - end
-    "float": pa.float32(),
-    "double": pa.float64(),
-    "binary": pa.binary(),
-    "string": pa.string(), # todo add enum support
-    "array": lambda type: pa.list_(type),
-    "object": None, # todo: lambda type: pa.map_(pa.string(), type)
-    "date": pa.date32(),
-    "date-time": pa.timestamp("ms", tz="UTC"),
-    "geometry": pa.binary(),
-    "bounding-box": None # todo
-}
-
-# geopandas datatypes to pyarrow datatypes
-GP_TO_PA_TYPE_MAP = {
-    "string": pa.string(), # todo add enum support
-    "|S0": pa.string(), # todo
-    "<U0": pa.string(), # todo
-    "bool": pa.bool_(),
-    "int8": pa.int8(), # todo add enum support - start
-    "uint8": pa.uint8(),
-    "int16": pa.int16(),
-    "uint16": pa.uint16(),
-    "int32": pa.int32(),
-    "uint32": pa.uint32(),
-    "int64": pa.int64(),
-    "uint64": pa.uint64(), # todo add enum support - end
-    "float16": pa.float16(),
-    "float32": pa.float32(),
-    "float64": pa.float64(),
-    "float128": None, # todo
-    "complex64": None, # todo
-    "complex128": None, # todo
-    "complex256": None, # todo
-    "object": pa.string(),
-    "datetime64": pa.timestamp("ms", tz="UTC"),
-    "record": None, # todo
-    "timedelta64": None # todo
-}
-
-# checks pyarrow datatypes
-PA_TYPE_CHECK = {
-    "boolean": pat.is_boolean,
-    "int8": pat.is_int8,
-    "uint8": pat.is_uint8,
-    "int16": pat.is_int16,
-    "uint16": pat.is_uint16,
-    "int32": pat.is_int32,
-    "uint32": pat.is_uint32,
-    "int64": pat.is_int64,
-    "uint64": pat.is_uint64,
-    "float": pat.is_float32,
-    "double": pat.is_float64,
-    "binary": pat.is_binary,
-    "string": pat.is_string,
-    "array": pat.is_list,
-    "object": pat.is_map,
-    "date": pat.is_date32,
-    "date-time": pat.is_timestamp,
-    "geometry": pat.is_binary, # todo: check more?
-    "bounding-box": None # todo
-}
-
 LOG_STATUS_COLOR = {
     "info": "white",
     "warning": "yellow",
@@ -109,3 +8,4 @@
 SUPPORTED_PROTOCOLS = ["http", "https", "s3", "gs"]
 
 STAC_COLLECTION_SCHEMA = "http://schemas.stacspec.org/v1.0.0/collection-spec/json-schema/collection.json"
+STAC_TABLE_EXTENSION = "https://stac-extensions.github.io/table/v1.2.0/schema.json"
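
The three type-mapping tables removed above did not just disappear: judging by the new imports in fiboa_cli/parquet.py further down, they now sit behind helper functions in a new fiboa_cli/types.py module, one of the nine changed files that is not shown in this excerpt. A minimal sketch of how the geopandas-to-pyarrow lookup could be wrapped there, assuming the mapping data moved over largely unchanged; only the function name and call signature come from the diff, the body is a guess:

import pyarrow as pa

# Hypothetical sketch - the real fiboa_cli/types.py is not part of this excerpt.
# Trimmed version of the GP_TO_PA_TYPE_MAP that was removed from const.py above.
GP_TO_PA_TYPE_MAP = {
    "bool": pa.bool_(),
    "int32": pa.int32(),
    "int64": pa.int64(),
    "float32": pa.float32(),
    "float64": pa.float64(),
    "object": pa.string(),
    "datetime64": pa.timestamp("ms", tz="UTC"),
}

def get_pyarrow_type_for_geopandas(dtype, required = False):
    # Signature inferred from the call site in parquet.py. Returns None for
    # unsupported dtypes so the caller can decide to skip the column; the
    # `required` flag presumably feeds nullability downstream (unused here).
    return GP_TO_PA_TYPE_MAP.get(dtype)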

fiboa_cli/convert_utils.py (+1 −2)

@@ -1,3 +1,4 @@
+from .const import STAC_TABLE_EXTENSION
 from .version import fiboa_version
 from .util import log, download_file, get_fs, to_iso8601
 from .create_geoparquet import create_geoparquet
@@ -9,8 +10,6 @@
 import geopandas as gpd
 import pandas as pd
 
-STAC_TABLE_EXTENSION = "https://stac-extensions.github.io/table/v1.2.0/schema.json"
-
 def convert(
     output_file, cache_file,
     url, columns,

fiboa_cli/create_geojson.py (−2)

@@ -2,8 +2,6 @@
 import os
 import pandas as pd
 
-from geopandas import GeoDataFrame
-
 from .util import load_parquet_data, load_parquet_schema, parse_metadata, to_iso8601
 
 

fiboa_cli/parquet.py (+27 −33)

@@ -4,7 +4,7 @@
 from geopandas import GeoDataFrame
 from shapely.geometry import shape
 
-from .const import PA_TYPE_MAP, GP_TYPE_MAP, GP_TO_PA_TYPE_MAP
+from .types import get_geopandas_dtype, get_pyarrow_type_for_geopandas, get_pyarrow_field
 from .util import log, load_fiboa_schema, load_file, merge_schemas
 from .geopandas import to_parquet
 
@@ -39,23 +39,31 @@ def create_parquet(data, columns, collection, output_file, config, missing_schema
     # Define the fields for the schema
     pq_fields = []
     for name in columns:
+        required_props = schemas.get("required", [])
         properties = schemas.get("properties", {})
+        required = name in required_props
         if name in properties:
             prop_schema = properties[name]
-            pa_type = create_type(prop_schema)
-            nullable = name not in schemas.get("required", [])
-            field = pa.field(name, pa_type, nullable = nullable)
+            try:
+                field = get_pyarrow_field(name, schema = prop_schema, required = required)
+            except Exception as e:
+                log(f"{name}: Skipped - {e}", "warning")
         else:
             pd_type = str(data[name].dtype) # pandas data type
-            pa_type = GP_TO_PA_TYPE_MAP.get(pd_type) # pyarrow data type
-            if pa_type is not None:
-                log(f"{name}: No schema defined, converting {pd_type} to nullable {pa_type}", "warning")
-                field = pa.field(name, pa_type, nullable = True)
-            else:
-                log(f"{name}: No schema defined and converter doesn't support {pd_type}, skipping field", "warning")
+            try:
+                pa_type = get_pyarrow_type_for_geopandas(pd_type, required) # pyarrow data type
+                if pa_type is not None:
+                    log(f"{name}: No schema defined, converting {pd_type} to nullable {pa_type}", "warning")
+                    field = get_pyarrow_field(name, pa_type = pa_type)
+            except Exception as e:
+                log(f"{name}: Skipped - {e}", "warning")
                 continue
 
-        pq_fields.append(field)
+        if field is None:
+            log(f"{name}: Skipped - invalid data type", "warning")
+            continue
+        else:
+            pq_fields.append(field)
 
     # Define the schema for the Parquet file
     pq_schema = pa.schema(pq_fields)
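
get_pyarrow_field() is called in two ways in the hunk above: with a fiboa property schema (schema=..., required=...) or with a ready-made pyarrow type (pa_type=...). Since fiboa_cli/types.py is not shown in this excerpt, the following is only a compatible sketch; in particular, mapping enums to dictionary-encoded Arrow columns is an assumption based on the commit title:

import pyarrow as pa

# Trimmed stand-in for the PA_TYPE_MAP removed from const.py in this commit.
PA_TYPE_MAP = {
    "boolean": pa.bool_(),
    "int32": pa.int32(),
    "int64": pa.int64(),
    "float": pa.float32(),
    "double": pa.float64(),
    "string": pa.string(),
    "date-time": pa.timestamp("ms", tz="UTC"),
}

def get_pyarrow_field(name, schema = None, pa_type = None, required = False):
    # Hypothetical sketch; signature inferred from the two call sites above.
    if pa_type is None:
        dtype = schema.get("type")
        if dtype == "string" and "enum" in schema:
            # Assumption: enum properties become dictionary-encoded strings.
            pa_type = pa.dictionary(pa.int32(), pa.string())
        else:
            pa_type = PA_TYPE_MAP.get(dtype)
        if pa_type is None:
            raise Exception(f"{dtype} is not supported yet")
    return pa.field(name, pa_type, nullable = not required)

Under these assumptions, get_pyarrow_field("crop", schema = {"type": "string", "enum": ["wheat", "maize"]}, required = True) would produce a non-nullable dictionary<int32, string> field.
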
@@ -96,40 +104,26 @@ def features_to_dataframe(features, columns):
 
 def update_dataframe(data, columns, schema):
     # Convert the data to the correct types
+    properties = schema.get("properties", {})
+    required_props = schema.get("required", [])
     for column in columns:
-        if column not in schema["properties"]:
+        if column not in properties:
             continue
-        dtype = schema["properties"][column].get("type")
+        schema = properties[column]
+        dtype = schema.get("type")
         if dtype == "geometry":
             continue
 
-        gp_type = GP_TYPE_MAP.get(dtype)
+        required = column in required_props
+        gp_type = get_geopandas_dtype(dtype, required, schema)
         try:
             if gp_type is None:
                 log(f"{column}: No type conversion available for {dtype}")
             elif callable(gp_type):
                 data[column] = gp_type(data[column])
             else:
-                data[column] = data[column].astype(gp_type)
+                data[column] = data[column].astype(gp_type, copy = False)
         except Exception as e:
             log(f"{column}: Can't convert to {dtype}: {e}", "warning")
 
     return data
-
-def create_type(schema):
-    dtype = schema.get("type")
-    if dtype is None:
-        raise Exception("No type specified")
-
-    pa_type = PA_TYPE_MAP.get(dtype)
-    if pa_type is None:
-        raise Exception(f"{dtype} is not supported yet")
-    elif callable(pa_type):
-        if dtype == "array":
-            pa_subtype = create_type(schema["items"])
-            pa_type = pa_type(pa_subtype)
-        elif dtype == "object":
-            log(f"Creation of object-typed properties not supported yet", "warning")
-            pass # todo
-
-    return pa_type
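
Similarly, update_dataframe() in the hunk above now delegates to get_geopandas_dtype(), passing the required flag and the whole property schema rather than just the type name, presumably so that enum values can influence the pandas dtype. A plausible sketch, with the same caveat that the real implementation in fiboa_cli/types.py is not shown:

import pandas as pd

# Trimmed stand-in for the GP_TYPE_MAP removed from const.py in this commit.
GP_TYPE_MAP = {
    "boolean": "bool",
    "int32": "int32",
    "int64": "int64",
    "float": "float32",
    "double": "float64",
    "string": "str",
    "date-time": lambda x: pd.to_datetime(x),
}

def get_geopandas_dtype(dtype, required = False, schema = None):
    # Hypothetical sketch; signature inferred from the call site above.
    if schema and "enum" in schema:
        # Assumption: enum columns become pandas categoricals, which keep the
        # data compact and dictionary-encode naturally in (Geo)Parquet.
        return pd.CategoricalDtype(schema["enum"])
    # Either a dtype string for .astype() or a callable, matching the
    # callable(gp_type) branch in update_dataframe() above.
    return GP_TYPE_MAP.get(dtype)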
