feat: update nhm plugin to work with vds vNext
BREAKING CHANGE: update nhm plugin to work with vds vNext

Additionally, this removes the record_show action from this plugin and moves it to the vds plugin (where it can still be called as record_show, since that aligns well with CKAN's naming conventions, but can also be called as vds_data_get to match vds's naming conventions). There is also a new action, datastore_search, which chains the datastore_search action from vds (or others!) and adds the include_urls parameter.
jrdh committed Sep 16, 2024
1 parent 3e6b1e4 commit 9d7d176
Showing 28 changed files with 10,735 additions and 363 deletions.
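
For context, the chained datastore_search action described in the commit message could be wired up roughly as in the minimal sketch below, using CKAN's toolkit.chained_action decorator to delegate to whichever plugin provides datastore_search next in the chain (vds or another). The include_urls handling and the _record_url helper are illustrative assumptions, not the plugin's actual implementation.

from ckan.plugins import toolkit


def _record_url(resource_id, record_id):
    # hypothetical URL builder, purely for illustration
    return f'/record/{resource_id}/{record_id}'


@toolkit.chained_action
def datastore_search(next_action, context, data_dict):
    # pop the extra parameter so the downstream datastore_search (provided by
    # vds, or by another plugin further along the chain) never sees it
    include_urls = toolkit.asbool(data_dict.pop('include_urls', False))
    result = next_action(context, data_dict)
    if include_urls:
        resource_id = data_dict['resource_id']
        for record in result.get('records', []):
            record['url'] = _record_url(resource_id, record.get('_id'))
    return result

In the real plugin the action would be registered through IActions; how the URLs are actually built for each record is not shown in this diff.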
9 changes: 4 additions & 5 deletions ckanext/nhm/dcat/specimen_records.py
@@ -14,7 +14,6 @@
Namespaces,
object_uri,
as_dwc_list,
epoch_to_datetime,
)
from ckanext.nhm.lib.dwc import dwc_terms
from ckanext.nhm.lib.helpers import get_department
@@ -95,7 +94,7 @@ def __init__(
self.version = version

# figure out the rounded version of the record
self.rounded_version = toolkit.get_action('datastore_get_rounded_version')(
self.rounded_version = toolkit.get_action('vds_version_round')(
{},
{
'resource_id': self.record.resource_id,
@@ -261,7 +260,7 @@ def _cetaf_cspp(self):
yield (
self.record_ref,
self.namespaces.dc.created,
Literal(epoch_to_datetime(self.record.data['created'])),
Literal(self.record.data['created']),
)
yield self.record_ref, self.namespaces.dc.publisher, URIRef('https://nhm.ac.uk')

@@ -381,15 +380,15 @@ def _dwc(self):
yield (
self.record_ref,
self.namespaces.dc.created,
Literal(epoch_to_datetime(self.record.data['created'])),
Literal(self.record.data['created']),
)

if self.record.data.get('modified', None) is not None:
# yield the modified date in the correct format
yield (
self.record_ref,
self.namespaces.dwc.modified,
Literal(epoch_to_datetime(self.record.data['modified'])),
Literal(self.record.data['modified']),
)

def _version_info(self):
16 changes: 2 additions & 14 deletions ckanext/nhm/dcat/utils.py
@@ -4,11 +4,10 @@
# This file is part of ckanext-nhm
# Created by the Natural History Museum in London, UK

from ckanext.nhm.lib.helpers import get_specimen_resource_id
from datetime import datetime
from rdflib import namespace

from ckan.plugins import toolkit
from ckanext.nhm.lib.helpers import get_specimen_resource_id


def object_uri(uuid, version=None):
@@ -35,18 +34,7 @@ def as_dwc_list(objects):
:param objects: the objects
:return: a | separated string
"""
return ' | '.join(objects)


def epoch_to_datetime(epoch_timestamp):
"""
Converts the given epoch timestamp into a datetime object. The timestamp passed in
is assumed to be the integer number of milliseconds since the UNIX epoch.
:param epoch_timestamp: the integer number of milliseconds since the UNIX epoch
:return: a datetime object
"""
return datetime.fromtimestamp(epoch_timestamp / 1000.0)
return " | ".join(objects)


class Namespaces:
14 changes: 7 additions & 7 deletions ckanext/nhm/lib/filter_options.py
@@ -5,6 +5,8 @@
# Created by the Natural History Museum in London, UK

from elasticsearch_dsl import Q
from splitgill.indexing.fields import DocumentField
from splitgill.search import exists_query, term_query


class FilterOption:
@@ -43,18 +45,16 @@ def as_dict(self):


# define some simple, common filters
has_image = FilterOption(
'_has_image', 'Has image', Q('exists', field='data.associatedMedia')
)
has_image = FilterOption("_has_image", "Has image", exists_query("associatedMedia"))

has_lat_long = FilterOption(
'_has_lat_long', 'Has lat/long', Q('exists', field='meta.geo')
"_has_lat_long", "Has lat/long", Q("exists", field=DocumentField.ALL_POINTS)
)

exclude_mineralogy = FilterOption(
'_exclude_mineralogy',
'Exclude Mineralogy',
"_exclude_mineralogy",
"Exclude Mineralogy",
# note the ~ which inverts the query
~Q('term', **{'data.collectionCode': 'min'}),
~term_query("collectionCode", "min", case_sensitive=False),
hide=True,
)
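
The filter changes above replace raw elasticsearch_dsl queries aimed at the old data.*/meta.* document paths with splitgill's search helpers, which target the vNext document layout. As a rough standalone illustration, reusing only the helper signatures that appear in the diff (the variable names are just examples):

from splitgill.search import exists_query, term_query

# match records that have any value in the associatedMedia field
has_media = exists_query("associatedMedia")

# match records whose collectionCode is "min", ignoring case; prefixing an
# elasticsearch_dsl query with ~ inverts it, as the exclude_mineralogy filter does
is_mineralogy = term_query("collectionCode", "min", case_sensitive=False)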
80 changes: 44 additions & 36 deletions ckanext/nhm/lib/helpers.py
@@ -95,22 +95,28 @@ def get_record_count():


@cache_region('collection_stats', 'record_stats')
def get_record_stats():
    start_version = 1501545600
    end_version = int(time.time())
    count_action = toolkit.get_action('datastore_count')

    record_stats = []
    for v in range(start_version, end_version, 604800):
        record_stats.append(
            {
                'date': datetime.fromtimestamp(v),
                'count': count_action({}, {'version': v * 1000}),
            }
        )

    return record_stats
def get_record_stats() -> List[dict]:
    """
    Returns a list of dictionaries containing statistics about the number of records
    available each week starting from 01/08/2017 and ending now.

    :return: a list of dicts
    """
    # 01/08/2017 as ms epoch
    start_version = 1501545600000
    # now as ms epoch
    end_version = int(time.time() * 1000)
    # 1 week in ms
    step = 604800000
    count_action = toolkit.get_action("vds_multi_count")

    return [
        {
            'date': datetime.fromtimestamp(version / 1000),
            'count': count_action({}, {'version': version})["total"],
        }
        for version in range(start_version, end_version, step)
    ]


def _get_action(action, params):
@@ -156,7 +162,7 @@ def get_record(resource_id, record_id):
:param record_id: the ID of the record
"""
record = _get_action(
'record_show', {'resource_id': resource_id, 'record_id': record_id}
"vds_data_get", {"resource_id": resource_id, "record_id": record_id}
)
return record.get('data', None)

@@ -278,7 +284,7 @@ def get_nhm_organisation_id():
:returns: ID for the NHM organisation
"""
value = toolkit.config.get('ldap.organization.id')
value = toolkit.config.get("ldap.organization.id")
return str(value) if value is not None else None


@@ -294,6 +300,7 @@ def is_collection_resource_id(resource_id: str) -> bool:
get_artefact_resource_id(),
get_indexlot_resource_id(),
get_specimen_resource_id(),
get_sample_resource_id(),
}
return resource_id in resource_ids

@@ -319,32 +326,34 @@ def get_indexlot_resource_id():


def get_artefact_resource_id():
'''
@return: ID for artefact resource
'''
return toolkit.config.get('ckanext.nhm.artefact_resource_id')


def get_sample_resource_id() -> str:
return toolkit.config.get("ckanext.nhm.sample_resource_id")


def get_beetle_iiif_resource_id():
"""
Get the ID for the beetle IIIF resource.
:return: the resource id
"""
value = toolkit.config.get('ckanext.nhm.beetle_iiif_resource_id')
value = toolkit.config.get("ckanext.nhm.beetle_iiif_resource_id")
return str(value) if value is not None else None


@cache_region('collection_stats', 'collection_stats')
@cache_region("collection_stats", "collection_stats")
def collection_stats():
"""
Get collection stats, including collection codes and collection totals.
"""
stats = {}
collections = [
('artefacts', get_artefact_resource_id()),
('indexlots', get_indexlot_resource_id()),
('specimens', get_specimen_resource_id()),
("artefacts", get_artefact_resource_id()),
("indexlots", get_indexlot_resource_id()),
("specimens", get_specimen_resource_id()),
("samples", get_sample_resource_id()),
]

collections_total = 0
Expand All @@ -353,7 +362,7 @@ def collection_stats():
'resource_id': resource_id,
'limit': 0,
}
stats[name] = toolkit.get_action('datastore_search')({}, params)['total']
stats[name] = toolkit.get_action("vds_basic_count")({}, params)
collections_total += stats[name]
stats['total'] = collections_total

@@ -375,7 +384,7 @@ def collection_stats():
}
},
}
total = toolkit.get_action('datastore_multisearch')({}, params)['total']
total = toolkit.get_action("vds_multi_count")({}, params)["total"]
collection_code_counts.append((collection_code, total))

collection_code_counts.sort(key=operator.itemgetter(1), reverse=True)
@@ -553,7 +562,7 @@ def get_resource_fields(resource, version=None, use_request_version=False):
if '__version__' in filters:
data['version'] = int(filters['__version__'][0])

result = toolkit.get_action('datastore_search')({}, data)
result = toolkit.get_action("vds_basic_query")({}, data)
return [field['id'] for field in result.get('fields', [])]


@@ -1031,7 +1040,7 @@ def get_resource_facets(resource):
if toolkit.h.get_param_int('_{}_limit'.format(field_name)) == 0:
search_params.setdefault('facet_limits', {})[field_name] = 50

search = toolkit.get_action('datastore_search')(context, search_params)
search = toolkit.get_action("vds_basic_query")(context, search_params)
facets = []

# dictionary of facet name => formatter function with camel_case_to_string defined
@@ -1249,7 +1258,7 @@ def resource_view_get_filterable_fields(resource):
'resource_id': resource['id'],
'limit': 0,
}
fields = toolkit.get_action('datastore_search')({}, data).get('fields', [])
fields = toolkit.get_action("vds_basic_query")({}, data).get("fields", [])

# sort and filter the fields ensuring we only return string type fields and don't
# return the id
@@ -1280,7 +1289,7 @@ def _get_latest_update(package_or_resource_dicts):
"""
# a list of fields on the resource that should contain update dates
fields = ['last_modified', 'revision_timestamp', 'Created']
get_rounded_version = toolkit.get_action('datastore_get_rounded_version')
get_rounded_version = toolkit.get_action("vds_version_round")

latest_dict = None
latest_date = None
@@ -1398,7 +1407,7 @@ def get_object_url(resource_id, guid, version=None, include_version=True):
:return: the object url
"""
if include_version:
rounded_version = toolkit.get_action('datastore_get_rounded_version')(
rounded_version = toolkit.get_action("vds_version_round")(
{},
{
'resource_id': resource_id,
Expand Down Expand Up @@ -1537,13 +1546,12 @@ def group_resources(resource_list):


def get_resource_size(resource_dict):
prefixes = 'KMGTPEZ' # kilo, mega, giga, etc. I could make this a list but what's the point
if toolkit.get_action('datastore_is_datastore_resource')(
if toolkit.get_action("vds_resource_check")(
{}, {'resource_id': resource_dict['id']}
):
try:
records = toolkit.get_action('datastore_count')(
{}, {'resource_ids': [resource_dict['id']]}
records = toolkit.get_action("vds_basic_count")(
{}, {'resource_id': resource_dict['id']}
)
return f'{records} records'
except:
@@ -1572,7 +1580,7 @@ def get_record_permalink(
'record_id': record_dict['_id'],
}
if include_version:
rounded_version = toolkit.get_action('datastore_get_rounded_version')(
rounded_version = toolkit.get_action("vds_version_round")(
{},
{
'resource_id': resource_dict['id'],
21 changes: 11 additions & 10 deletions ckanext/nhm/lib/record.py
@@ -36,7 +36,7 @@ def get_record_by_uuid(uuid, version=None) -> Optional['Record']:
'version': version,
}
# retrieve datastore record
search_result = toolkit.get_action('datastore_search')(
search_result = toolkit.get_action("vds_basic_query")(
context, search_data_dict
)
records = search_result['records']
@@ -183,7 +183,7 @@ def data(self) -> dict:
data_dict = dict(record_id=self.id, resource_id=self.resource_id)
if self.version is not None:
data_dict['version'] = self.version
self._data = toolkit.get_action('record_show')(self._context, data_dict)[
self._data = toolkit.get_action("vds_data_get")(self._context, data_dict)[
'data'
]
return self._data
@@ -384,7 +384,7 @@ def geojson(self) -> Optional[dict]:
extract the values from the record data and return a GeoJSON compatible Point
where the record is located.
:return: None if the latitude and longitude couldn't be identified or a GeoJSON Point
:return: None if the latitude and longitude couldn't be identified or a GeoJSON
Point
"""
lat_field = self.resource.get(
LATITUDE_FIELD, DWC_LATITUDE if self.is_dwc else None
Expand All @@ -396,12 +397,12 @@ def geojson(self) -> Optional[dict]:
if not lat_field or not lon_field:
return None

latitude = self.data.get(lat_field)
longitude = self.data.get(lon_field)

if latitude is None or longitude is None:
try:
latitude = float(self.data.get(lat_field))
longitude = float(self.data.get(lon_field))
except (ValueError, TypeError):
return None

# create a piece of GeoJSON to point at the specific record location on a map (note the
# longitude then latitude ordering required by GeoJSON)
return dict(type='Point', coordinates=[float(longitude), float(latitude)])
# create a piece of GeoJSON to point at the specific record location on a map
# (note the longitude then latitude ordering required by GeoJSON)
return dict(type="Point", coordinates=[longitude, latitude])