Skip to content

Commit 0a17883

Browse files
committed
Add fiboa improve command #79 #21
1 parent e19454c commit 0a17883

File tree

7 files changed

+178
-17
lines changed

7 files changed

+178
-17
lines changed

CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
99

1010
### Added
1111

12+
- Command `fiboa improve` with helpers to
13+
- change the CRS
14+
- change the GeoParquet version and compression
15+
- fill missing perimeter/area values
16+
- fix invalid geometries
17+
- rename columns
1218
- Converter for Switzerland
1319

1420
## [v0.8.0] - 2024-11-12

fiboa_cli/__init__.py

+73-7
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,18 @@
66
import click
77
import pandas as pd
88

9+
from .const import COMPRESSION_METHODS, CORE_COLUMNS
910
from .convert import convert as convert_
1011
from .convert import list_all_converter_ids, list_all_converters
1112
from .create_geojson import create_geojson as create_geojson_
1213
from .create_geoparquet import create_geoparquet as create_geoparquet_
1314
from .describe import describe as describe_
14-
from .merge import merge as merge_, DEFAULT_COLUMNS, DEFAULT_CRS
15+
from .improve import improve as improve_
16+
from .merge import merge as merge_, DEFAULT_CRS
1517
from .jsonschema import jsonschema as jsonschema_
1618
from .rename_extension import rename_extension as rename_extension_
1719
from .util import (check_ext_schema_for_cli, log, parse_converter_input_files,
18-
valid_file_for_cli, valid_file_for_cli_with_ext,
20+
parse_map, valid_file_for_cli, valid_file_for_cli_with_ext,
1921
valid_files_folders_for_cli, valid_folder_for_cli)
2022
from .validate import validate as validate_
2123
from .validate_schema import validate_schema as validate_schema_
@@ -376,7 +378,7 @@ def jsonschema(schema, out, fiboa_version, id_):
376378
)
377379
@click.option(
378380
'--compression', '-pc',
379-
type=click.Choice(["brotli", "gzip", "lz4", "snappy", "zstd", "none"]),
381+
type=click.Choice(COMPRESSION_METHODS),
380382
help='Compression method for the Parquet file.',
381383
show_default=True,
382384
default="brotli"
@@ -385,7 +387,7 @@ def jsonschema(schema, out, fiboa_version, id_):
385387
'--geoparquet1', '-gp1',
386388
is_flag=True,
387389
type=click.BOOL,
388-
help='Enforces generating a GeoParquet 1.0 file bounding box. Defaults to GeoParquet 1.1 with bounding box.',
390+
help='Enforces generating a GeoParquet 1.0 file. Defaults to GeoParquet 1.1 with bounding box.',
389391
default=False
390392
)
391393
@click.option(
@@ -518,7 +520,7 @@ def rename_extension(folder, title, slug, org = "fiboa", prefix = None):
518520
multiple=True,
519521
help='Additional column names to include.',
520522
show_default=True,
521-
default=DEFAULT_COLUMNS,
523+
default=CORE_COLUMNS,
522524
)
523525
@click.option(
524526
'--exclude', '-e',
@@ -536,7 +538,7 @@ def rename_extension(folder, title, slug, org = "fiboa", prefix = None):
536538
)
537539
@click.option(
538540
'--compression', '-pc',
539-
type=click.Choice(["brotli", "gzip", "lz4", "snappy", "zstd", "none"]),
541+
type=click.Choice(COMPRESSION_METHODS),
540542
help='Compression method for the Parquet file.',
541543
show_default=True,
542544
default="brotli"
@@ -545,7 +547,7 @@ def rename_extension(folder, title, slug, org = "fiboa", prefix = None):
545547
'--geoparquet1', '-gp1',
546548
is_flag=True,
547549
type=click.BOOL,
548-
help='Enforces generating a GeoParquet 1.0 file bounding box. Defaults to GeoParquet 1.1 with bounding box.',
550+
help='Enforces generating a GeoParquet 1.0 file. Defaults to GeoParquet 1.1 with bounding box.',
549551
default=False
550552
)
551553
def merge(datasets, out, crs, include, exclude, extension, compression, geoparquet1):
@@ -564,6 +566,69 @@ def merge(datasets, out, crs, include, exclude, extension, compression, geoparqu
564566
sys.exit(1)
565567

566568

569+
## IMPROVE (add area, perimeter, and fix geometries)
570+
@click.command()
@click.argument('input', nargs=1, type=click.Path(exists=True))
@click.option(
    '--out', '-o',
    type=click.Path(exists=False),
    help='Path to write the GeoParquet file to. If not given, overwrites the input file.',
    default=None
)
@click.option(
    '--rename-column', '-r',
    type=click.STRING,
    # Converts the tuple of "old=new" strings collected by click into a dict
    callback=lambda ctx, param, value: parse_map(value),
    multiple=True,
    help='Renaming of columns. Provide the old name and the new name separated by an equal sign. Can be used multiple times.'
)
@click.option(
    '--add-sizes', '-sz',
    is_flag=True,
    type=click.BOOL,
    help='Computes missing sizes (area, perimeter)',
    default=False
)
@click.option(
    '--fix-geometries', '-g',
    is_flag=True,
    type=click.BOOL,
    help='Tries to fix invalid geometries that are reported by the validator (uses shapely\'s make_valid method internally)',
    default=False
)
@click.option(
    '--crs',
    type=click.STRING,
    help='Coordinate Reference System (CRS) to use for the GeoParquet file.',
    show_default=True,
    # NOTE(review): because a default CRS is always passed, every run
    # reprojects the data to this CRS unless --crs says otherwise - confirm
    # that this is intended for runs that only rename columns or add sizes.
    default=DEFAULT_CRS,
)
@click.option(
    '--compression', '-pc',
    type=click.Choice(COMPRESSION_METHODS),
    help='Compression method for the Parquet file.',
    show_default=True,
    default="brotli"
)
@click.option(
    '--geoparquet1', '-gp1',
    is_flag=True,
    type=click.BOOL,
    help='Enforces generating a GeoParquet 1.0 file. Defaults to GeoParquet 1.1 with bounding box.',
    default=False
)
def improve(input, out, rename_column, add_sizes, fix_geometries, crs, compression, geoparquet1):
    """
    "Improves" a fiboa GeoParquet file according to the given parameters.
    """
    # The docstring above is shown by click as the command's help text.
    log(f"fiboa CLI {__version__} - Improve datasets\n", "success")
    try:
        # Delegate to the implementation; the arguments are passed positionally
        # in the order expected by fiboa_cli.improve.improve.
        improve_(input, out, rename_column, add_sizes, fix_geometries, crs, compression, geoparquet1)
    except Exception as e:
        # Top-level CLI boundary: report the error and exit with a failure code
        log(e, "error")
        sys.exit(1)
630+
631+
567632
cli.add_command(describe)
568633
cli.add_command(validate)
569634
cli.add_command(validate_schema)
@@ -574,6 +639,7 @@ def merge(datasets, out, crs, include, exclude, extension, compression, geoparqu
574639
cli.add_command(converters)
575640
cli.add_command(rename_extension)
576641
cli.add_command(merge)
642+
cli.add_command(improve)
577643

578644
if __name__ == '__main__':
579645
cli()

fiboa_cli/const.py

+11
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,14 @@
1010
STAC_COLLECTION_SCHEMA = "http://schemas.stacspec.org/v{version}/collection-spec/json-schema/collection.json"
1111
GEOPARQUET_SCHEMA = "https://geoparquet.org/releases/v{version}/schema.json"
1212
STAC_TABLE_EXTENSION = "https://stac-extensions.github.io/table/v1.2.0/schema.json"
13+
14+
COMPRESSION_METHODS = ["brotli", "gzip", "lz4", "snappy", "zstd", "none"]
15+
16+
CORE_COLUMNS = [
17+
"id",
18+
"geometry",
19+
"area",
20+
"perimeter",
21+
"determination_datetime",
22+
"determination_method",
23+
]

fiboa_cli/convert.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def convert(
1313
collection = False,
1414
compression = None,
1515
geoparquet1 = False,
16-
mapping_file=None,
16+
mapping_file = None,
1717
):
1818
if dataset in IGNORED_DATASET_FILES:
1919
raise Exception(f"'{dataset}' is not a converter")

fiboa_cli/improve.py

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import os
2+
3+
from pyproj import CRS
4+
5+
from .const import CORE_COLUMNS
6+
from .parquet import create_parquet
7+
from .util import load_parquet_data, load_parquet_schema, log, parse_metadata
8+
9+
10+
def improve(input, out = None, rename_columns = None, add_sizes = False, fix_geometries = False, crs = None, compression = None, geoparquet1 = False):
    """
    Applies a set of optional improvements to a fiboa GeoParquet file.

    The steps run in this order: reproject, fix geometries, rename columns,
    fill missing area/perimeter values. The result is written with
    create_parquet.

    :param input: Path to the GeoParquet file to read.
    :param out: Path to write to. If falsy, the input file is overwritten.
    :param rename_columns: Mapping of existing column names to new names.
    :param add_sizes: If true, fills missing "area" and "perimeter" values.
    :param fix_geometries: If true, runs make_valid() on the geometries.
    :param crs: Target CRS. If None, the CRS is left unchanged.
    :param compression: Passed through to create_parquet.
    :param geoparquet1: Passed through to create_parquet.
    :raises ValueError: If a column to rename does not exist in the file.
    """
    # Copy the mapping so that neither a shared default nor the caller's dict is reused
    rename_columns = dict(rename_columns) if rename_columns else {}

    # Prepare and determine location of the output file
    if not out:
        out = input
    else:
        out_dir = os.path.dirname(out)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Load the dataset
    schema = load_parquet_schema(input)
    collection = parse_metadata(schema, b"fiboa")
    columns = list(schema.names)
    # Remove the bbox column to avoid conflicts when writing GeoParquet file later.
    # Not every file has one, so only remove it if present.
    if "bbox" in columns:
        columns.remove("bbox")
    gdf = load_parquet_data(input, columns = columns)

    # Change the CRS
    if crs is not None:
        gdf.to_crs(crs=crs, inplace=True)
        log(f"Changed CRS to {crs}", "info")

    # Fix geometries
    if fix_geometries:
        gdf.geometry = gdf.geometry.make_valid()
        log("Fixed geometries", "info")

    # Rename columns
    if rename_columns:
        missing = [col for col in rename_columns if col not in columns]
        if missing:
            raise ValueError(f"Cannot rename columns that don't exist: {', '.join(missing)}")

        for col in rename_columns:
            if col in CORE_COLUMNS:
                log(f"Column {col} is a fiboa core field - do you really want to rename it?", "warning")
            if ":" in col:
                log(f"Column {col} may be a fiboa extension field - do you really want to rename it?", "warning")

        # Apply the mapping to all names at once so that the column list stays
        # in sync with DataFrame.rename, which also maps simultaneously
        columns = [rename_columns.get(col, col) for col in columns]
        gdf.rename(columns=rename_columns, inplace=True)
        log("Renamed columns", "info")

    # Add sizes
    if add_sizes:
        # Add the area and perimeter columns
        for name in ["area", "perimeter"]:
            if name not in columns:
                # Create column if not present
                gdf[name] = None
                columns.append(name)

        gdf_m = gdf
        # Determine whether the given CRS is in meters
        if gdf.crs.axis_info[0].unit_name not in ["m", "metre", "meter"]:
            # Reproject the geometries to an equal-area projection if needed
            gdf_m = gdf.to_crs("EPSG:6933")

        # Compute the missing area and perimeter values; existing values are kept.
        # The area is scaled by 0.0001, i.e. square meters become hectares.
        gdf["area"] = gdf_m["area"].fillna(gdf_m.geometry.area * 0.0001)
        gdf["perimeter"] = gdf_m["perimeter"].fillna(gdf_m.geometry.length)

    # Write the improved dataset to the output file
    create_parquet(gdf, columns, collection, out, {}, compression=compression, geoparquet1=geoparquet1)
    log(f"Wrote data to {out}", "success")

fiboa_cli/merge.py

+2-9
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,19 @@
22

33
import pandas as pd
44

5+
from .const import CORE_COLUMNS
56
from .parquet import create_parquet
67
from .util import load_parquet_data, load_parquet_schema, log, parse_metadata
78
from .version import fiboa_version
89

9-
DEFAULT_COLUMNS = [
10-
"id",
11-
"geometry",
12-
"area",
13-
"perimeter",
14-
"determination_datetime",
15-
"determination_method",
16-
]
1710
DEFAULT_CRS = "EPSG:4326"
1811

1912
def merge(datasets, out, crs = DEFAULT_CRS, includes = [], excludes = [], extensions = [], compression = None, geoparquet1 = False):
2013
dir = os.path.dirname(out)
2114
if dir:
2215
os.makedirs(dir, exist_ok=True)
2316

24-
columns = DEFAULT_COLUMNS.copy()
17+
columns = CORE_COLUMNS.copy()
2518
columns.extend(includes)
2619
columns = list(set(columns) - set(excludes))
2720

fiboa_cli/util.py

+16
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,22 @@ def parse_converter_input_files(ctx, param, value):
224224
return sources
225225

226226

227+
def parse_map(value, separator = "="):
    """
    Parses a tuple of "key<separator>value" strings into a dictionary.

    Each entry is split at the first occurrence of the separator only, so the
    value part may itself contain the separator. If a key occurs more than
    once, the last entry wins.

    :param value: Tuple of strings, or None.
    :param separator: String that separates key and value in each entry.
    :return: Dictionary of keys to values; empty for None or an empty tuple.
    :raises click.BadParameter: If value is not a tuple or an entry has no separator.
    """
    if value is None:
        return {}
    elif not isinstance(value, tuple):
        raise click.BadParameter('Mappings must be provided as a tuple')

    mapping = {}
    for entry in value:
        # partition() splits at the first separator only and reports whether it was found
        key, found, target = entry.partition(separator)
        if not found:
            raise click.BadParameter(
                f"Invalid mapping '{entry}': expected two names separated by '{separator}'"
            )
        mapping[key] = target

    return mapping
241+
242+
227243
def name_from_uri(url):
228244
if "://" in url:
229245
try:

0 commit comments

Comments
 (0)