Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: keep Item created dates when resupplying TDE-1298 #1145

Draft
wants to merge 8 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .github/workflows/format-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,28 +31,28 @@ jobs:

- name: End to end test - Aerial Imagery
run: |
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/aerial.json --preset webp --target-epsg 2193 --source-epsg 2193 --target /tmp/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --gsd 10 --create-footprints=true
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/aerial.json --preset webp --target-epsg 2193 --source-epsg 2193 --target /tmp/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --gsd 10 --create-footprints=true --current-datetime=2020-01-02T03:04:05Z
cmp --silent "${{ runner.temp }}/BG35_1000_4829.tiff" ./scripts/tests/data/output/BG35_1000_4829.tiff

- name: End to end test - Elevation
run: |
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/dem.json --preset dem_lerc --target-epsg 2193 --source-epsg 2193 --target /tmp/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --gsd 30 --create-footprints=true
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/dem.json --preset dem_lerc --target-epsg 2193 --source-epsg 2193 --target /tmp/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --gsd 30 --create-footprints=true --current-datetime=2020-01-02T03:04:05Z
cmp --silent "${{ runner.temp }}/BK39_10000_0102.tiff" ./scripts/tests/data/output/BK39_10000_0102.tiff
cmp --silent "${{ runner.temp }}/BK39_10000_0101.tiff" ./scripts/tests/data/output/BK39_10000_0101.tiff

- name: End to end test - Historical Aerial Imagery
run: |
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/hi.json --preset webp --target-epsg 2193 --source-epsg 2193 --target /tmp/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --gsd 60 --create-footprints=true
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/hi.json --preset webp --target-epsg 2193 --source-epsg 2193 --target /tmp/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --gsd 60 --create-footprints=true --current-datetime=2020-01-02T03:04:05Z
cmp --silent "${{ runner.temp }}/BQ31_5000_0608.tiff" ./scripts/tests/data/output/BQ31_5000_0608.tiff

- name: End to end test - Cutline (Aerial Imagery)
run: |
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/aerial.json --preset webp --target-epsg 2193 --source-epsg 2193 --target /tmp/cutline/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --cutline ./tests/data/cutline_aerial.fgb --gsd 10 --create-footprints=true
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/aerial.json --preset webp --target-epsg 2193 --source-epsg 2193 --target /tmp/cutline/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --cutline ./tests/data/cutline_aerial.fgb --gsd 10 --create-footprints=true --current-datetime=2020-01-02T03:04:05Z
cmp --silent "${{ runner.temp }}/cutline/BG35_1000_4829.tiff" ./scripts/tests/data/output/BG35_1000_4829_cut.tiff

- name: End to end test - Footprint
run: |
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/aerial.json --preset webp --target-epsg 2193 --source-epsg 2193 --target /tmp/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --gsd 10m --create-footprints=true
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/aerial.json --preset webp --target-epsg 2193 --source-epsg 2193 --target /tmp/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --gsd 10m --create-footprints=true --current-datetime=2020-01-02T03:04:05Z
jq 'select(.xy_coordinate_resolution == 1E-8) // error("Wrong or missing X/Y coordinate resolution")' "${{ runner.temp }}/BG35_1000_4829_footprint.geojson"
cmp --silent <(jq "del(.features[0].properties.location, .xy_coordinate_resolution)" "${{ runner.temp }}/BG35_1000_4829_footprint.geojson") <(jq "del(.features[0].properties.location, .xy_coordinate_resolution)" ./scripts/tests/data/output/BG35_1000_4829_footprint.geojson)

Expand All @@ -64,7 +64,7 @@ jobs:

- name: End to end test - Restandardise Aerial Imagery
run: |
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/restandardise.json --preset webp --target-epsg 2193 --source-epsg 2193 --target /tmp/restandardise/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --gsd 10 --create-footprints=true
docker run -v "${{ runner.temp }}:/tmp/" topo-imagery python3 standardise_validate.py --from-file ./tests/data/restandardise.json --preset webp --target-epsg 2193 --source-epsg 2193 --target /tmp/restandardise/ --collection-id 123 --start-datetime 2023-01-01 --end-datetime 2023-01-01 --gsd 10 --create-footprints=true --current-datetime=2020-01-02T03:04:05Z
cmp --silent "${{ runner.temp }}/restandardise/BG35_1000_4829.tiff" ./scripts/tests/data/output/BG35_1000_4829.tiff

- name: End to end test - Translate Ascii Files (Elevation)
Expand All @@ -74,7 +74,7 @@ jobs:

- name: End to end test - Remove empty files
run: |
docker run -v "${{ runner.temp }}/tmp-empty/:/tmp/" topo-imagery python3 standardise_validate.py --from-file=./tests/data/empty.json --preset=webp --target-epsg=2193 --source-epsg=2193 --target=/tmp --collection-id=123 --start-datetime=2023-01-01 --end-datetime=2023-01-01 --gsd 60 --create-footprints=true
docker run -v "${{ runner.temp }}/tmp-empty/:/tmp/" topo-imagery python3 standardise_validate.py --from-file=./tests/data/empty.json --preset=webp --target-epsg=2193 --source-epsg=2193 --target=/tmp --collection-id=123 --start-datetime=2023-01-01 --end-datetime=2023-01-01 --gsd 60 --create-footprints=true --current-datetime=2020-01-02T03:04:05Z
empty_target_directory="$(find "${{ runner.temp }}/tmp-empty" -maxdepth 0 -type d -empty)"
[[ -n "$empty_target_directory" ]]

Expand Down
15 changes: 0 additions & 15 deletions scripts/files/fs.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os
from concurrent.futures import Future, ThreadPoolExecutor
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING

from boto3 import resource
from linz_logger import get_log
Expand All @@ -11,11 +8,6 @@
from scripts.files import fs_local, fs_s3
from scripts.stac.util.checksum import multihash_as_hex

if TYPE_CHECKING:
from mypy_boto3_s3 import S3Client
else:
S3Client = dict


def write(destination: str, source: bytes, content_type: str | None = None) -> str:
"""Write a file from its source to a destination path.
Expand Down Expand Up @@ -87,13 +79,6 @@ def exists(path: str) -> bool:
return fs_local.exists(path)


def modified(path: str, s3_client: S3Client | None = None) -> datetime:
"""Get modified datetime for S3 URL or local path"""
if is_s3(path):
return fs_s3.modified(fs_s3.bucket_name_from_path(path), fs_s3.prefix_from_path(path), s3_client)
return fs_local.modified(Path(path))


def write_all(inputs: list[str], target: str, concurrency: int | None = 4, generate_name: bool | None = True) -> list[str]:
"""Writes list of files to target destination using multithreading.
Args:
Expand Down
8 changes: 0 additions & 8 deletions scripts/files/fs_local.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import os
from datetime import datetime, timezone
from pathlib import Path


def write(destination: str, source: bytes) -> None:
Expand Down Expand Up @@ -38,9 +36,3 @@ def exists(path: str) -> bool:
True if the path exists
"""
return os.path.exists(path)


def modified(path: Path) -> datetime:
"""Get path modified datetime as UTC"""
modified_timestamp = os.path.getmtime(path)
return datetime.fromtimestamp(modified_timestamp, tz=timezone.utc)
6 changes: 0 additions & 6 deletions scripts/files/fs_s3.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from collections.abc import Generator
from concurrent import futures
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import TYPE_CHECKING, Any

from boto3 import client, resource
Expand Down Expand Up @@ -241,8 +240,3 @@ def get_object_parallel_multithreading(
yield key, future.result()
else:
yield key, exception


def modified(bucket_name: str, key: str, s3_client: S3Client | None) -> datetime:
s3_client = s3_client or client("s3")
return _get_object(bucket_name, key, s3_client)["LastModified"]
12 changes: 1 addition & 11 deletions scripts/files/tests/fs_local_test.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import os
from pathlib import Path

import pytest

from scripts.files.fs_local import exists, modified, read, write
from scripts.tests.datetimes_test import any_epoch_datetime
from scripts.files.fs_local import exists, read, write


@pytest.mark.dependency(name="write")
Expand Down Expand Up @@ -45,11 +43,3 @@ def test_exists(setup: str) -> None:
def test_exists_file_not_found() -> None:
found = exists("/tmp/test.file")
assert found is False


def test_should_get_modified_datetime(setup: str) -> None:
path = Path(os.path.join(setup, "modified.file"))
path.touch()
modified_datetime = any_epoch_datetime()
os.utime(path, times=(any_epoch_datetime().timestamp(), modified_datetime.timestamp()))
assert modified(path) == modified_datetime
21 changes: 1 addition & 20 deletions scripts/files/tests/fs_s3_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,12 @@
from boto3 import client, resource
from botocore.exceptions import ClientError
from moto import mock_aws
from moto.core.models import DEFAULT_ACCOUNT_ID
from moto.s3.models import s3_backends
from moto.s3.responses import DEFAULT_REGION_NAME
from moto.wafv2.models import GLOBAL_REGION
from mypy_boto3_s3 import S3Client
from pytest import CaptureFixture, raises
from pytest_subtests import SubTests

from scripts.files.files_helper import ContentType
from scripts.files.fs_s3 import exists, list_files_in_uri, modified, read, write
from scripts.tests.datetimes_test import any_epoch_datetime
from scripts.files.fs_s3 import exists, list_files_in_uri, read, write


@mock_aws
Expand Down Expand Up @@ -174,17 +169,3 @@ def test_list_files_in_uri(subtests: SubTests) -> None:

with subtests.test():
assert "data/image.tiff" not in files


@mock_aws
def test_should_get_modified_datetime() -> None:
bucket_name = "any-bucket-name"
key = "any-key"
modified_datetime = any_epoch_datetime()

s3_client: S3Client = client("s3", region_name=DEFAULT_REGION_NAME)
s3_client.create_bucket(Bucket=bucket_name)
s3_client.put_object(Bucket=bucket_name, Key=key, Body=b"any body")
s3_backends[DEFAULT_ACCOUNT_ID][GLOBAL_REGION].buckets[bucket_name].keys[key].last_modified = modified_datetime

assert modified(bucket_name, key, s3_client) == modified_datetime
31 changes: 2 additions & 29 deletions scripts/files/tests/fs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,13 @@
from shutil import rmtree
from tempfile import mkdtemp

from boto3 import client, resource
from boto3 import resource
from moto import mock_aws
from moto.core.models import DEFAULT_ACCOUNT_ID
from moto.s3.models import s3_backends
from moto.s3.responses import DEFAULT_REGION_NAME
from moto.wafv2.models import GLOBAL_REGION
from mypy_boto3_s3 import S3Client
from pytest import CaptureFixture, raises
from pytest_subtests import SubTests

from scripts.files.fs import NoSuchFileError, modified, read, write, write_all, write_sidecars
from scripts.tests.datetimes_test import any_epoch_datetime
from scripts.files.fs import NoSuchFileError, read, write, write_all, write_sidecars


def test_read_key_not_found_local() -> None:
Expand Down Expand Up @@ -103,25 +98,3 @@ def test_write_all_in_order(setup: str) -> None:
i += 1
written_files = write_all(inputs=inputs, target=setup, generate_name=False)
assert written_files == inputs


@mock_aws
def test_should_get_s3_object_modified_datetime() -> None:
bucket_name = "any-bucket-name"
key = "any-key"
modified_datetime = any_epoch_datetime()

s3_client: S3Client = client("s3", region_name=DEFAULT_REGION_NAME)
s3_client.create_bucket(Bucket=bucket_name)
s3_client.put_object(Bucket=bucket_name, Key=key, Body=b"any body")
s3_backends[DEFAULT_ACCOUNT_ID][GLOBAL_REGION].buckets[bucket_name].keys[key].last_modified = modified_datetime

assert modified(f"s3://{bucket_name}/{key}", s3_client) == modified_datetime


def test_should_get_local_file_modified_datetime(setup: str) -> None:
path = os.path.join(setup, "modified.file")
Path(path).touch()
modified_datetime = any_epoch_datetime()
os.utime(path, times=(any_epoch_datetime().timestamp(), modified_datetime.timestamp()))
assert modified(path) == modified_datetime
87 changes: 57 additions & 30 deletions scripts/stac/imagery/create_stac.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import json
import os
from os import environ, path
from typing import Any

from linz_logger import get_log
from shapely.geometry.base import BaseGeometry

from scripts.datetimes import format_rfc_3339_datetime_string, utc_now
from scripts.datetimes import utc_now
from scripts.files import fs
from scripts.files.files_helper import get_file_name_from_path
from scripts.files.fs import modified, read
from scripts.files.fs import NoSuchFileError, read
from scripts.files.geotiff import get_extents
from scripts.gdal.gdal_helper import gdal_info
from scripts.gdal.gdalinfo import GdalInfo
Expand All @@ -17,7 +17,7 @@
from scripts.stac.imagery.metadata_constants import CollectionMetadata
from scripts.stac.imagery.provider import Provider, ProviderRole
from scripts.stac.link import Link, Relation
from scripts.stac.util import checksum
from scripts.stac.util.checksum import multihash_as_hex
from scripts.stac.util.media_type import StacMediaType


Expand Down Expand Up @@ -85,8 +85,10 @@ def create_item(
end_datetime: str,
collection_id: str,
gdal_version: str,
current_datetime: str,
gdalinfo_result: GdalInfo | None = None,
derived_from: list[str] | None = None,
published_path: str | None = None,
) -> ImageryItem:
"""Create an ImageryItem (STAC) to be linked to a Collection.

Expand All @@ -96,18 +98,15 @@ def create_item(
end_datetime: end date of the survey
collection_id: collection id to link to the Item
gdal_version: GDAL version
current_datetime: datetime string that represents the current time when the item is created
gdalinfo_result: result of the gdalinfo command. Defaults to None.
derived_from: list of STAC Items from where this Item is derived. Defaults to None.
published_path: path of the published dataset. Defaults to None.

Returns:
a STAC Item wrapped in ImageryItem
"""
item = create_base_item(asset_path, gdal_version)

if not gdalinfo_result:
gdalinfo_result = gdal_info(asset_path)

geometry, bbox = get_extents(gdalinfo_result)
item = create_base_item(asset_path, gdal_version, current_datetime, published_path)

if derived_from is not None:
for derived in derived_from:
Expand All @@ -127,48 +126,76 @@ def create_item(
)

item.update_datetime(start_datetime, end_datetime)
item.update_spatial(geometry, bbox)
item.update_spatial(*get_extents(gdalinfo_result or gdal_info(asset_path)))
item.add_collection(collection_id)

get_log().info("ImageryItem created", path=asset_path)
return item


def create_base_item(asset_path: str, gdal_version: str) -> ImageryItem:
def create_base_item(asset_path: str, gdal_version: str, current_datetime: str, published_path: str | None) -> ImageryItem:
    """Create an ImageryItem with the basic fields populated.

    When a previously published STAC Item exists for the same file ID at
    `published_path`, its `created` datetimes (and, if the asset checksum is
    unchanged, its processing properties) are carried over so resupplying a
    dataset does not reset Item history.

    Args:
        asset_path: path of the visual asset (TIFF)
        gdal_version: GDAL version string
        current_datetime: datetime string that represents the current time when the item is created
        published_path: path of the published dataset, or None if nothing is published yet

    Returns:
        An ImageryItem with basic information.
    """
    if (topo_imagery_hash := environ.get("GIT_HASH")) is not None:
        commit_url = f"https://github.com/linz/topo-imagery/commit/{topo_imagery_hash}"
    else:
        commit_url = "GIT_HASH not specified"

    # Default processing metadata for a freshly (re)standardised asset.
    stac_processing = STACProcessing(
        **{
            "processing:datetime": current_datetime,
            "processing:software": STACProcessingSoftware(**{"gdal": gdal_version, "linz/topo-imagery": commit_url}),
            "processing:version": environ.get("GIT_VERSION", "GIT_VERSION not specified"),
        }
    )

    id_ = get_file_name_from_path(asset_path)
    file_content = fs.read(asset_path)
    # Unless an existing published Item says otherwise, both datetimes are "now".
    created_datetime = updated_datetime = current_datetime

    if published_path:
        try:
            existing_item_content = read(path.join(published_path, f"{id_}.json"))
            existing_item = json.loads(existing_item_content.decode("UTF-8"))
            try:
                # Keep the Item-level created date from the published Item.
                created_datetime = existing_item["properties"]["created"]
            except KeyError:
                # NOTE(review): falling back to current_datetime here may be wrong —
                # a missing 'properties.created' arguably needs backfilling instead. TODO confirm.
                get_log().info(f"Existing Item {id_} does not have 'properties.created' attribute")

            try:
                if multihash_as_hex(file_content) == existing_item["assets"]["visual"]["file:checksum"]:
                    # Asset content is unchanged: keep the existing asset created/updated
                    # times and the processing properties of the published Item.
                    created_datetime = existing_item["assets"]["visual"]["created"]
                    updated_datetime = existing_item["assets"]["visual"]["updated"]
                    stac_processing = STACProcessing(
                        **{
                            "processing:datetime": existing_item["properties"]["processing:datetime"],
                            "processing:software": existing_item["properties"]["processing:software"],
                            "processing:version": existing_item["properties"]["processing:version"],
                        }
                    )
            except KeyError:
                get_log().info(f"Existing Item for {id_} does not have 'assets.visual' attributes")

        except NoSuchFileError:
            # First publication for this ID: nothing to carry over.
            get_log().info(f"No Item is published for ID: {id_}")

    stac_asset = STACAsset(
        **{
            "href": path.join(".", path.basename(asset_path)),
            "file:checksum": multihash_as_hex(file_content),
            "created": created_datetime,
            "updated": updated_datetime,
        }
    )

    return ImageryItem(id_, stac_asset, stac_processing)
Loading
Loading