|
4 | 4 | from geopandas import GeoDataFrame
|
5 | 5 | from shapely.geometry import shape
|
6 | 6 |
|
7 |
| -from .const import PA_TYPE_MAP, GP_TYPE_MAP, GP_TO_PA_TYPE_MAP |
| 7 | +from .types import get_geopandas_dtype, get_pyarrow_type_for_geopandas, get_pyarrow_field |
8 | 8 | from .util import log, load_fiboa_schema, load_file, merge_schemas
|
9 | 9 | from .geopandas import to_parquet
|
10 | 10 |
|
@@ -39,23 +39,31 @@ def create_parquet(data, columns, collection, output_file, config, missing_schem
|
39 | 39 | # Define the fields for the schema
|
40 | 40 | pq_fields = []
|
41 | 41 | for name in columns:
|
| 42 | + required_props = schemas.get("required", []) |
42 | 43 | properties = schemas.get("properties", {})
|
| 44 | + required = name in required_props |
43 | 45 | if name in properties:
|
44 | 46 | prop_schema = properties[name]
|
45 |
| - pa_type = create_type(prop_schema) |
46 |
| - nullable = name not in schemas.get("required", []) |
47 |
| - field = pa.field(name, pa_type, nullable = nullable) |
| 47 | + try: |
| 48 | + field = get_pyarrow_field(name, schema = prop_schema, required = required) |
| 49 | + except Exception as e: |
| 50 | + log(f"{name}: Skipped - {e}", "warning") |
48 | 51 | else:
|
49 | 52 | pd_type = str(data[name].dtype) # pandas data type
|
50 |
| - pa_type = GP_TO_PA_TYPE_MAP.get(pd_type) # pyarrow data type |
51 |
| - if pa_type is not None: |
52 |
| - log(f"{name}: No schema defined, converting {pd_type} to nullable {pa_type}", "warning") |
53 |
| - field = pa.field(name, pa_type, nullable = True) |
54 |
| - else: |
55 |
| - log(f"{name}: No schema defined and converter doesn't support {pd_type}, skipping field", "warning") |
| 53 | + try: |
| 54 | + pa_type = get_pyarrow_type_for_geopandas(pd_type, required) # pyarrow data type |
| 55 | + if pa_type is not None: |
| 56 | + log(f"{name}: No schema defined, converting {pd_type} to nullable {pa_type}", "warning") |
| 57 | + field = get_pyarrow_field(name, pa_type = pa_type) |
| 58 | + except Exception as e: |
| 59 | + log(f"{name}: Skipped - {e}", "warning") |
56 | 60 | continue
|
57 | 61 |
|
58 |
| - pq_fields.append(field) |
| 62 | + if field is None: |
| 63 | + log(f"{name}: Skipped - invalid data type", "warning") |
| 64 | + continue |
| 65 | + else: |
| 66 | + pq_fields.append(field) |
59 | 67 |
|
60 | 68 | # Define the schema for the Parquet file
|
61 | 69 | pq_schema = pa.schema(pq_fields)
|
@@ -96,40 +104,26 @@ def features_to_dataframe(features, columns):
|
96 | 104 |
|
97 | 105 | def update_dataframe(data, columns, schema):
|
98 | 106 | # Convert the data to the correct types
|
| 107 | + properties = schema.get("properties", {}) |
| 108 | + required_props = schema.get("required", []) |
99 | 109 | for column in columns:
|
100 |
| - if column not in schema["properties"]: |
| 110 | + if column not in properties: |
101 | 111 | continue
|
102 |
| - dtype = schema["properties"][column].get("type") |
| 112 | + schema = properties[column] |
| 113 | + dtype = schema.get("type") |
103 | 114 | if dtype == "geometry":
|
104 | 115 | continue
|
105 | 116 |
|
106 |
| - gp_type = GP_TYPE_MAP.get(dtype) |
| 117 | + required = column in required_props |
| 118 | + gp_type = get_geopandas_dtype(dtype, required, schema) |
107 | 119 | try:
|
108 | 120 | if gp_type is None:
|
109 | 121 | log(f"{column}: No type conversion available for {dtype}")
|
110 | 122 | elif callable(gp_type):
|
111 | 123 | data[column] = gp_type(data[column])
|
112 | 124 | else:
|
113 |
| - data[column] = data[column].astype(gp_type) |
| 125 | + data[column] = data[column].astype(gp_type, copy = False) |
114 | 126 | except Exception as e:
|
115 | 127 | log(f"{column}: Can't convert to {dtype}: {e}", "warning")
|
116 | 128 |
|
117 | 129 | return data
|
118 |
| - |
119 |
| -def create_type(schema): |
120 |
| - dtype = schema.get("type") |
121 |
| - if dtype is None: |
122 |
| - raise Exception("No type specified") |
123 |
| - |
124 |
| - pa_type = PA_TYPE_MAP.get(dtype) |
125 |
| - if pa_type is None: |
126 |
| - raise Exception(f"{dtype} is not supported yet") |
127 |
| - elif callable(pa_type): |
128 |
| - if dtype == "array": |
129 |
| - pa_subtype = create_type(schema["items"]) |
130 |
| - pa_type = pa_type(pa_subtype) |
131 |
| - elif dtype == "object": |
132 |
| - log(f"Creation of object-typed properties not supported yet", "warning") |
133 |
| - pass # todo |
134 |
| - |
135 |
| - return pa_type |
0 commit comments