Skip to content

Commit

Permalink
feat: add capture-area.geojson to the STAC Collection TDE-965 (#758)
Browse files Browse the repository at this point in the history
* feat: add capture-area.geojson to the STAC Collection TDE-965

* fix: removed code has been accidentally brought back

* fix: formatting fails

* refactor: simplify gdal_footprint arguments

* feat: save capture-area.geojson checksum

* refactor: move add capture area logic to collection.py

* docs: grammar

Co-authored-by: Victor Engmark <[email protected]>

* test: improve capture area UT

* fix: unused 'type:ignore' comments

* build: fix poetry updating every lib when adding a new one

* build: poetry lock with geojson

* revert: "fix: unused 'type:ignore' comments"

This reverts commit 9707c62.

* fix: use list instead of tuple for file suffixes

* test: add a test for list_files_in_uri

* refactor: store suffixes in const

* feat: save capture-area.json file size in the collection.json

* fix: allow margin of error to the polygon geometry

* feat: create temporary script to generate capture-area from existing published dataset

* refactor: remove test script

* fix: enum value

* feat: round the geometry using the gsd

* fix: typo

* docs: comment for gsd

* fix: typo file name

* fix: typo file name

* refactor: simplify comparison

Co-authored-by: Victor Engmark <[email protected]>

* refactor: simplify code in tests

* refactor: rename variable for readability

* refactor: avoid temporary capture-area file

* refactor: remove condition

Co-authored-by: Victor Engmark <[email protected]>

* refactor: store EPSG codes in constants

* test: replace assert not by assert

* test: simplify test polygons

* refactor: re-use variable

Co-authored-by: Victor Engmark <[email protected]>

* docs: language

Co-authored-by: Victor Engmark <[email protected]>

* docs: move comment to docstring

---------

Co-authored-by: Victor Engmark <[email protected]>
  • Loading branch information
paulfouquet and l0b0 authored Feb 14, 2024
1 parent 83c9a68 commit 75df081
Show file tree
Hide file tree
Showing 17 changed files with 393 additions and 42 deletions.
46 changes: 29 additions & 17 deletions scripts/collection_from_items.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,21 @@
import os
from typing import List

import shapely.geometry
import shapely.ops
from boto3 import client
from linz_logger import get_log

from scripts.cli.cli_helper import coalesce_multi_single, valid_date
from scripts.files.fs_s3 import bucket_name_from_path, get_object_parallel_multithreading, list_json_in_uri
from scripts.files.files_helper import SUFFIX_FOOTPRINT, SUFFIX_JSON
from scripts.files.fs_s3 import bucket_name_from_path, get_object_parallel_multithreading, list_files_in_uri
from scripts.logging.time_helper import time_in_ms
from scripts.stac.imagery.collection import ImageryCollection
from scripts.stac.imagery.metadata_constants import DATA_CATEGORIES, HUMAN_READABLE_REGIONS, CollectionMetadata
from scripts.stac.imagery.provider import Provider, ProviderRole


# pylint: disable-msg=too-many-locals
# pylint: disable=too-many-locals
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--uri", dest="uri", help="s3 path to items and collection.json write location", required=True)
Expand Down Expand Up @@ -115,28 +118,37 @@ def main() -> None:

s3_client = client("s3")

files_to_read = list_json_in_uri(uri, s3_client)
files_to_read = list_files_in_uri(uri, [SUFFIX_JSON, SUFFIX_FOOTPRINT], s3_client)

start_time = time_in_ms()
polygons = []
for key, result in get_object_parallel_multithreading(
bucket_name_from_path(uri), files_to_read, s3_client, arguments.concurrency
):
item_stac = json.loads(result["Body"].read().decode("utf-8"))

if not arguments.collection_id == item_stac.get("collection"):
get_log().trace(
"skipping: item.collection != collection.id",
file=key,
action="collection_from_items",
reason="skip",
)
continue

collection.add_item(item_stac)
get_log().info("item added to collection", item=item_stac["id"], file=key)
content = json.load(result["Body"])
# The following if/else looks like it could be avoid by refactoring `list_files_in_uri()`
# to return a result list per suffix, but we would have to call `get_object_parallel_multithreading()`
# for each of them to avoid this if/else.
if key.endswith(SUFFIX_JSON):
if arguments.collection_id != content.get("collection"):
get_log().trace(
"skipping: item.collection != collection.id",
file=key,
action="collection_from_items",
reason="skip",
)
continue
collection.add_item(content)
get_log().info("item added to collection", item=content["id"], file=key)
elif key.endswith(SUFFIX_FOOTPRINT):
get_log().debug(f"adding geometry from {key}")
polygons.append(shapely.geometry.shape(content["features"][0]["geometry"]))

if polygons:
collection.add_capture_area(polygons, uri)

get_log().info(
"Matching items added to collection",
"Matching items added to collection and capture-area created",
item_count=len(files_to_read),
item_match_count=[dictionary["rel"] for dictionary in collection.stac["links"]].count("item"),
duration=time_in_ms() - start_time,
Expand Down
3 changes: 3 additions & 0 deletions scripts/files/files_helper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os
from enum import Enum

SUFFIX_JSON = ".json"
SUFFIX_FOOTPRINT = "_footprint.geojson"


class ContentType(str, Enum):
GEOTIFF = "image/tiff; application=geotiff; profile=cloud-optimized"
Expand Down
10 changes: 5 additions & 5 deletions scripts/files/fs_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from linz_logger import get_log

from scripts.aws.aws_helper import get_session, parse_path
from scripts.files.files_helper import is_json
from scripts.logging.time_helper import time_in_ms


Expand Down Expand Up @@ -164,15 +163,16 @@ def prefix_from_path(path: str) -> str:
return path.replace(f"s3://{bucket_name}/", "")


def list_json_in_uri(uri: str, s3_client: Optional[client]) -> List[str]:
"""Get the `JSON` files from a s3 path
def list_files_in_uri(uri: str, suffixes: List[str], s3_client: Optional[client]) -> List[str]:
"""Get a list of file paths from a s3 path based on their suffixes
Args:
uri: an s3 path
suffixes: a a list of suffixes. example: [".json", "_meta.xml"]
s3_client: an s3 client
Returns:
a list of JSON files
a list of file paths
"""
if not s3_client:
s3_client = client("s3")
Expand All @@ -183,7 +183,7 @@ def list_json_in_uri(uri: str, s3_client: Optional[client]) -> List[str]:
for response in response_iterator:
for contents_data in response["Contents"]:
key = contents_data["Key"]
if not is_json(key):
if not key.lower().endswith(tuple(suffixes)):
get_log().trace("skipping file not json", file=key, action="collection_from_items", reason="skip")
continue
files.append(key)
Expand Down
19 changes: 18 additions & 1 deletion scripts/files/tests/fs_s3_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pytest import CaptureFixture, raises

from scripts.files.files_helper import ContentType
from scripts.files.fs_s3 import exists, read, write
from scripts.files.fs_s3 import exists, list_files_in_uri, read, write


@mock_s3 # type: ignore
Expand Down Expand Up @@ -124,3 +124,20 @@ def test_exists_object_starting_with_not_exists() -> None:
file_exists = exists("s3://testbucket/hello/another.fi")

assert file_exists is False


@mock_s3 # type: ignore
def test_list_files_in_uri() -> None:
bucket_name = "testbucket"
s3 = resource("s3", region_name=DEFAULT_REGION_NAME)
boto3_client = client("s3", region_name=DEFAULT_REGION_NAME)
s3.create_bucket(Bucket=bucket_name)
boto3_client.put_object(Bucket=bucket_name, Key="data/collection.json", Body=b"")
boto3_client.put_object(Bucket=bucket_name, Key="data/image.tiff", Body=b"")
boto3_client.put_object(Bucket=bucket_name, Key="data/image_meta.xml", Body=b"")

files = list_files_in_uri(f"s3://{bucket_name}/data/", [".json", "_meta.xml"], boto3_client)

assert len(files) == 2
assert set(files) == {"data/collection.json", "data/image_meta.xml"}
assert "data/image.tiff" not in files
10 changes: 9 additions & 1 deletion scripts/gdal/gdal_helper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import os
import subprocess
from enum import Enum
from shutil import rmtree
from tempfile import mkdtemp
from typing import List, Optional, cast
Expand All @@ -18,6 +19,13 @@ class GDALExecutionException(Exception):
pass


class EpsgCode(str, Enum):
EPSG_2193 = "EPSG:2193"
""" NZGD2000 / New Zealand Transverse Mercator 2000 (NZTM) """
EPSG_4326 = "EPSG:4326"
""" WGS84 - World Geodetic System 1984"""


def get_vfs_path(path: str) -> str:
"""Make the path as a GDAL Virtual File Systems path.
Expand Down Expand Up @@ -124,7 +132,7 @@ def get_srs() -> bytes:
Returns:
the output of `gdalsrsinfo`
"""
gdalsrsinfo_command = ["gdalsrsinfo", "-o", "wkt", "EPSG:2193"]
gdalsrsinfo_command = ["gdalsrsinfo", "-o", "wkt", EpsgCode.EPSG_2193]
gdalsrsinfo_result = run_gdal(gdalsrsinfo_command)
if gdalsrsinfo_result.stderr:
raise Exception(
Expand Down
70 changes: 70 additions & 0 deletions scripts/stac/imagery/capture_area.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import json
from typing import Any, Dict, List

from shapely import BufferCapStyle, BufferJoinStyle, Geometry, to_geojson, union_all
from shapely.geometry import Polygon

DECIMAL_DEGREES_1M = 0.00001
"""
Degree precision of ~1m (decimal places 5, https://en.wikipedia.org/wiki/Decimal_degrees)
"""


def to_feature(geometry: Geometry) -> Dict[str, Any]:
"""Transform a Geometry to a GeoJSON feature.
Args:
geometry: a Geometry
Returns:
a GeoJSON document.
"""
return {"geometry": json.loads(to_geojson(geometry)), "type": "Feature", "properties": {}}


def merge_polygons(polygons: List[Polygon], buffer_distance: float) -> Geometry:
"""Merge a list of polygons by converting them to a single geometry that covers the same area.
A buffer distance is used to buffer out the polygons before dissolving them together and then negative buffer them back in.
The merged geometry is simplify (rounded) to the decimal used for the buffer.
Args:
polygons: list of polygons to merge
buffer_distance: decimal places to use to buffer the polygons
Returns:
A single Geometry.
"""
buffered_polygons = []
for poly in polygons:
# Buffer each polygon to round up to the `buffer_distance`
buffered_poly = poly.buffer(buffer_distance, cap_style=BufferCapStyle.flat, join_style=BufferJoinStyle.mitre)
buffered_polygons.append(buffered_poly)
union_buffered = union_all(buffered_polygons)
# Negative buffer back in the polygons
union_unbuffered = union_buffered.buffer(-buffer_distance, cap_style=BufferCapStyle.flat, join_style=BufferJoinStyle.mitre)
union_simplified = union_unbuffered.simplify(buffer_distance)

return union_simplified


def generate_capture_area(polygons: List[Polygon], gsd: float) -> Dict[str, Any]:
"""Generate the capture area from a list of polygons.
Providing the `gsd` allows to round the geometry as we've seen some tiffs geometry being slightly off,
sometimes due to rounding issue in their creation process (before delivery).
If we don't apply this rounding, we could get a very small gaps between tiffs
which would result in a capture area having gaps.
The `gsd` (in meter) is multiplied by the 1m degree of precision.
Note that all the polygons are buffered which means a gap bigger than the gsd,
but < gsd*2) will be closed.
Args:
polygons: list of polygons of the area
gsd: Ground Sample Distance in meter
Returns:
The capture-area geojson document.
"""
buffer_distance = DECIMAL_DEGREES_1M * gsd
merged_polygons = merge_polygons(polygons, buffer_distance)

return to_feature(merged_polygons)
43 changes: 43 additions & 0 deletions scripts/stac/imagery/collection.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import json
import os
from datetime import datetime
from typing import Any, Dict, List, Optional

import shapely.geometry
import shapely.ops
import ulid

from scripts.files.files_helper import ContentType
from scripts.files.fs import write
from scripts.stac.imagery.capture_area import generate_capture_area
from scripts.stac.imagery.metadata_constants import (
DATA_CATEGORIES,
DEM,
Expand All @@ -20,7 +24,11 @@
SubtypeParameterError,
)
from scripts.stac.imagery.provider import Provider, ProviderRole
from scripts.stac.util import checksum
from scripts.stac.util.STAC_VERSION import STAC_VERSION
from scripts.stac.util.stac_extensions import StacExtensions

CAPTURE_AREA_FILE_NAME = "capture-area.geojson"


class ImageryCollection:
Expand Down Expand Up @@ -75,6 +83,41 @@ def __init__(

self.add_providers(providers)

def add_capture_area(self, polygons: List[shapely.geometry.shape], target: str) -> None:
"""Add the capture area of the Collection.
The `href` or path of the capture-area.geojson is always set as the relative `./capture-area.geojson`
Args:
polygons: list of geometries
target: path of the capture-area-geojson file
"""

# The GSD is measured in meters (e.g., `0.3m`)
capture_area_document = generate_capture_area(polygons, float(self.metadata["gsd"].replace("m", "")))
capture_area_content: bytes = json.dumps(capture_area_document).encode("utf-8")
file_checksum = checksum.multihash_as_hex(capture_area_content)
capture_area = {
"href": f"./{CAPTURE_AREA_FILE_NAME}",
"title": "Capture area",
"type": ContentType.GEOJSON,
"roles": ["metadata"],
"file:checksum": file_checksum,
"file:size": len(capture_area_content),
}
self.stac.setdefault("assets", {})["capture_area"] = capture_area

# Save `capture-area.geojson` in target
write(
os.path.join(target, CAPTURE_AREA_FILE_NAME),
capture_area_content,
content_type=ContentType.GEOJSON.value,
)

self.stac["stac_extensions"] = self.stac.get("stac_extensions", [])

if StacExtensions.file.value not in self.stac["stac_extensions"]:
self.stac["stac_extensions"].append(StacExtensions.file.value)

def add_item(self, item: Dict[Any, Any]) -> None:
"""Add an `Item` to the `links` of the `Collection`.
Expand Down
4 changes: 3 additions & 1 deletion scripts/stac/imagery/item.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
from typing import Any, Dict, Tuple

from scripts.files import fs
from scripts.stac.util import checksum
from scripts.stac.util.STAC_VERSION import STAC_VERSION
from scripts.stac.util.stac_extensions import StacExtensions
Expand All @@ -10,6 +11,7 @@ class ImageryItem:
stac: Dict[str, Any]

def __init__(self, id_: str, file: str) -> None:
file_content = fs.read(file)
self.stac = {
"type": "Feature",
"stac_version": STAC_VERSION,
Expand All @@ -21,7 +23,7 @@ def __init__(self, id_: str, file: str) -> None:
"visual": {
"href": os.path.join(".", os.path.basename(file)),
"type": "image/tiff; application=geotiff; profile=cloud-optimized",
"file:checksum": checksum.multihash_as_hex(file),
"file:checksum": checksum.multihash_as_hex(file_content),
}
},
"stac_extensions": [StacExtensions.file.value],
Expand Down
File renamed without changes.
Loading

0 comments on commit 75df081

Please sign in to comment.