Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ nanoid = "~=2.0.0"
whitenoise = "~=6.8.2"
pystac-client = "~=0.8.5"
ecs-logging = "*"
beautifulsoup4 = "*"
requests = "*"

[dev-packages]
yapf = "*"
Expand All @@ -40,6 +42,7 @@ bandit = "*"
pytest-xdist = "*"
types-nanoid = "*"
pytest-cov = "*"
types-requests = "*"

[requires]
python_version = "3.12"
37 changes: 36 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion app/distributions/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,7 @@ class PackageDistributionAdmin(admin.ModelAdmin): # type:ignore[type-arg]
'''Admin View for Package Distribution'''

list_display = ('package_distribution_id', 'managed_by_stac', 'dataset')
list_filter = ('managed_by_stac',)
list_filter = (
'managed_by_stac',
('dataset__provider', admin.RelatedOnlyFieldListFilter),
)
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
from difflib import SequenceMatcher
from re import split
from typing import Any
from typing import Literal
from typing import TypedDict
from typing import cast
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from distributions.models import Dataset
from distributions.models import PackageDistribution
from pystac.collection import Collection
from pystac_client import Client
from requests import get
from utils.command import CommandHandler
from utils.command import CustomBaseCommand

Expand All @@ -26,6 +30,8 @@ def __init__(self, command: CustomBaseCommand, options: dict['str', Any]):
self.dry_run = options['dry_run']
self.similarity = options['similarity']
self.url = options['url']
self.endpoint = options['endpoint']
self.default_dataset = options['default_dataset']
self.counts: dict[str, Counter] = {}

def increment_counter(self, model_name: str, operation: Operation, value: int = 1) -> None:
Expand All @@ -37,11 +43,55 @@ def increment_counter(self, model_name: str, operation: Operation, value: int =
def clear_package_distributions(self) -> None:
""" Remove existing package distributions previously imported from STAC. """

_, cleared = PackageDistribution.objects.filter(managed_by_stac=True).delete()
_, cleared = PackageDistribution.objects.filter(_legacy_imported=True).delete()
for model_class, count in cleared.items():
model_name = model_class.split('.')[-1].lower()
self.increment_counter(model_name, 'cleared', count)

def update_package_distribution(
self, collection_id: str, managed_by_stac: bool
) -> Dataset | None:
""" Create or update the package distribution with the given ID. """
managed = 'managed' if managed_by_stac else 'unmanaged'

# Get dataset
dataset = (
Dataset.objects.filter(dataset_id=collection_id).first() or
Dataset.objects.filter(dataset_id=self.default_dataset).first()
)
if not dataset:
self.print_warning("No dataset for collection id '%s'", collection_id)
if self.default_dataset:
self.print_warning("Default dataset '%s' does not exist", self.default_dataset)
return None

# Get or create package distribution
package_distribution = PackageDistribution.objects.filter(
package_distribution_id=collection_id, _legacy_imported=True
).first()
if not package_distribution:
package_distribution = PackageDistribution.objects.create(
package_distribution_id=collection_id,
_legacy_imported=True,
managed_by_stac=managed_by_stac,
dataset=dataset
)
self.increment_counter('package_distribution', 'added')
self.print(f"Added package distribution '{collection_id}' ({managed})")

# Update package distribution
if (
package_distribution.managed_by_stac != managed_by_stac or
package_distribution.dataset != dataset
):
package_distribution.managed_by_stac = managed_by_stac
package_distribution.dataset = dataset
package_distribution.save()
self.increment_counter('package_distribution', 'updated')
self.print(f"Updated package distribution '{collection_id}' ({managed})")

return dataset

def import_package_distributions(self) -> None:
""" Import package distributions from STAC.

Expand All @@ -53,43 +103,47 @@ def import_package_distributions(self) -> None:
"""
processed = set()

# Get collections
client = Client.open(self.url)
# Get managed collections from STAC API
client = Client.open(urljoin(self.url, self.endpoint))
for collection in client.collection_search().collections():
collection_id = collection.id
processed.add(collection_id)

# Get dataset
dataset = Dataset.objects.filter(dataset_id=collection_id).first()
dataset = self.update_package_distribution(collection_id, True)
if not dataset:
self.print_warning("No dataset for collection id '%s'", collection_id)
continue

# Get or create package distribution
package_distribution = PackageDistribution.objects.filter(
package_distribution_id=collection_id
).first()
if not package_distribution:
package_distribution = PackageDistribution.objects.create(
package_distribution_id=collection_id, managed_by_stac=True, dataset=dataset
)
self.increment_counter('package_distribution', 'added')
self.print(f"Added package distribution '{collection_id}'")

# Update package distribution
if not package_distribution.managed_by_stac or package_distribution.dataset != dataset:
package_distribution.managed_by_stac = True
package_distribution.dataset = dataset
package_distribution.save()
self.increment_counter('package_distribution', 'updated')
self.print(f"Updated package distribution '{collection_id}'")

self.check_provider(collection, dataset)
processed.add(collection_id)

# Get unmanaged collections from the HTML root
response = get(self.url, timeout=60)
element = BeautifulSoup(response.text, 'html.parser').find('div', id="data")
if not element:
raise ValueError(f"Error parsing {self.url}")

for line in split(r'\r?\n', element.text.strip()):
line = line.strip()
if not line:
continue

values = line.split(' ')
if len(values) == 0:
continue

collection_id = values[0]
if collection_id in processed:
continue

dataset = self.update_package_distribution(collection_id, False)
if not dataset:
continue

processed.add(collection_id)

# Remove orphaned package distributions
orphans = PackageDistribution.objects.filter(managed_by_stac=True).exclude(
package_distribution_id__in=processed
)
orphans = PackageDistribution.objects.filter(
_legacy_imported=True
).exclude(package_distribution_id__in=processed,)
_, removed = orphans.delete()
for model_class, count in removed.items():
model_name = model_class.split('.')[-1].lower()
Expand Down Expand Up @@ -166,8 +220,13 @@ def add_arguments(self, parser: CommandParser) -> None:
default=1.0,
help="Similarity threshold to use when comparing providers"
)
parser.add_argument("--url", type=str, default="https://data.geo.admin.ch", help="STAC URL")
parser.add_argument("--endpoint", type=str, default="/api/stac/v1", help="STAC endpoint")
parser.add_argument(
"--url", type=str, default="https://data.geo.admin.ch/api/stac/v0.9", help="STAC URL"
"--default-dataset",
type=str,
default="",
help="Add packages with missing dataset to this dataset"
)

def handle(self, *args: Any, **options: Any) -> None:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Generated by Django 5.0.14 on 2025-04-10 13:40

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):

dependencies = [
('distributions', '0012_dataset_geocat_id'),
]

operations = [
migrations.AddField(
model_name='packagedistribution',
name='_legacy_imported',
field=models.BooleanField(
default=False,
help_text='This field is used to track objects imported from STAC',
verbose_name='Legacy Imported'
),
),
migrations.AlterField(
model_name='packagedistribution',
name='managed_by_stac',
field=models.BooleanField(verbose_name='Managed by STAC'),
),
]
9 changes: 8 additions & 1 deletion app/distributions/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,14 @@ def __str__(self) -> str:
package_distribution_id = CustomSlugField(
_(_context, "External ID"), unique=True, max_length=100
)
managed_by_stac = models.BooleanField(_(_context, "Managed by STAC"), max_length=100)
managed_by_stac = models.BooleanField(_(_context, "Managed by STAC"))

_legacy_imported = models.BooleanField(
_(_context, "Legacy Imported"),
db_index=False,
default=False,
help_text="This field is used to track objects imported from STAC"
)

dataset = models.ForeignKey(Dataset, on_delete=models.CASCADE)

Expand Down
Loading
Loading